blob: 54a3520802a4f317ccdb792765c4427689710d9a [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
Antoine Pitrou00b2c862011-10-05 13:01:41 +020012try:
Victor Stinner3d4226a2018-08-29 22:21:32 +020013 import _testcapi
Pablo Galindo293dd232019-11-19 21:34:03 +000014except ImportError:
Victor Stinner3d4226a2018-08-29 22:21:32 +020015 _testcapi = None
16
17try:
Antoine Pitrou00b2c862011-10-05 13:01:41 +020018 import ctypes
19except ImportError:
20 ctypes = None
21 SIZEOF_WCHAR_T = -1
22else:
23 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a checker for the codec function *coder*.

    *self* is the test case whose ``assertEqual`` reports failures.  The
    returned ``check(input, expect)`` asserts that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. the expected output paired with the
    length of the input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
# On small versions of Windows like Windows IoT or Windows Nano Server not all
# codepages are present
def is_code_page_present(cp):
    """Return a true value if Windows code page *cp* can be queried.

    Windows only: calls ``GetCPInfoExW`` from kernel32 through ctypes and
    returns its BOOL result unchanged, so callers should only rely on its
    truthiness.
    """
    # Everything is imported locally so the function does not depend on a
    # module-level ``ctypes`` name being bound to the real module.
    from ctypes import POINTER, WINFUNCTYPE, Structure, WinDLL
    from ctypes.wintypes import BOOL, BYTE, DWORD, UINT, WCHAR

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2  # single or double byte
    MAX_PATH = 260

    # Output structure filled in by GetCPInfoExW.
    class CPINFOEXW(Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # The second argument (the DWORD flags parameter) is passed as 0.
    return GetCPInfoEx(cp, 0, info)
Victor Stinnerf96418d2015-09-21 23:06:27 +020050
class Queue:
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        """Start the queue with *buffer* as its initial content.

        The type of *buffer* (e.g. ``bytes`` or ``str``) determines what
        can later be written to and read from the queue.
        """
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
70
Victor Stinnerf96418d2015-09-21 23:06:27 +020071
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    Meant to be mixed into a test case class: the methods report failures
    through ``self.assertEqual`` and friends.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be transferred at every split of *s*.

        For each split point the first part is decoded, the decoder state is
        copied into a fresh decoder which decodes the rest, and the joined
        output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be transferred at every split of *u*.

        For each split point the first part is encoded, the encoder state is
        copied into a fresh encoder which encodes the rest, and the joined
        output must equal *s*.
        """
        for i in range(len(u)+1):
            e = codecs.getincrementalencoder(encoding)()
            part1 = e.encode(u[:i])
            state = e.getstate()
            e = codecs.getincrementalencoder(encoding)()
            e.setstate(state)
            part2 = e.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
104
Victor Stinnerf96418d2015-09-21 23:06:27 +0200105
class ReadTest(MixInCheckStateHandling):
    """Shared stream-reader and incremental-decoder tests for one codec.

    Concrete test classes combine this with ``unittest.TestCase`` and are
    expected to define:

    * ``encoding`` -- name of the codec under test.
    * ``ill_formed_sequence`` -- the bytes that test_lone_surrogates()
      expects as the "surrogatepass" encoding of the lone surrogate U+DC80.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and compare partial output.

        ``partialresults[i]`` is the text expected to have been decoded once
        the first ``i + 1`` bytes have been fed.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def check_decoder(d):
            # Same byte-by-byte check, driving incremental decoder *d*.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        check_decoder(d)

        # Check whether the reset method works properly
        d.reset()
        check_decoder(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line of *input* and join them with "|" so that the
            # line boundaries show up in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a stream reader must yield the \r\n-terminated
        # lines of this sample text one by one.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so data written between
        # readline() calls becomes visible to the reader incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text that decoding ill_formed_sequence with errors="replace" is
    # expected to produce in test_lone_surrogates().
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding the empty string leaves only what the codec emits
        # up front (the BOM, if any); strip it from the other encodings.
        bom = "".encode(self.encoding)
        # Expected "backslashreplace" output: one \xNN escape per byte.
        backslashreplace = ''.join('\\x%02x' % b
                                   for b in self.ill_formed_sequence)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +0200444
Victor Stinnerf96418d2015-09-21 23:06:27 +0200445
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order selected by a BOM)."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in the native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
540
Victor Stinnerf96418d2015-09-21 23:06:27 +0200541
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Little-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # No BOM here: each character is complete after its fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
586
Victor Stinnerf96418d2015-09-21 23:06:27 +0200587
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Big-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # No BOM here: each character is complete after its fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
632
633
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-driven "utf-16" codec."""

    encoding = "utf-16"
    # The two bytes of U+DC80 (a lone low surrogate), ordered to match
    # sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two consecutive writes through one StreamWriter must produce
        # a single BOM, and the result must read back unchanged.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # b"\xff\xff" is neither BOM, so reading must fail.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single truncated byte with final=True under the 'replace'
        # and 'ignore' error handlers.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000719
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # U+DC80 (a lone low surrogate), little-endian.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected output under the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one failing vector from hiding the others.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP code point round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
763
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate), big-endian.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected output under the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one failing vector from hiding the others.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP code point round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
807
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec.

    The expected encoder output is always written as ``self.BOM + ...`` so
    that a subclass can override ``encoding`` and ``BOM`` and reuse the tests.
    """

    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # The same two invalid bytes under each built-in error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # The error must cover exactly the two surrogates that
        # surrogateescape cannot map back to a byte.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates pass through as three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same sequences come back as surrogates.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted sequences are still errors.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        # Each prefix is followed by two bytes that cannot continue it.
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
900
Victor Stinnerf96418d2015-09-21 23:06:27 +0200901
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected output under the 'replace' handler)
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP code point and its explicit surrogate pair encode to
        # the same bytes; decoding works with or without the closing '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected output under the 'replace' handler)
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001034
1035
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level ``codecs.utf_16_ex_decode`` function."""

    def test_errors(self):
        # A single byte with final=True is truncated input.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is required.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1043
class ReadBufferTest(unittest.TestCase):
    """Tests for ``codecs.readbuffer_encode``."""

    def test_array(self):
        # An array.array of bytes is accepted as input.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument and an int argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1059
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec.

    Reuses the UTF8Test cases with ``encoding`` and ``BOM`` overridden, and
    adds BOM-specific checks.
    """

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # One expected result per input byte fed to the decoder.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder drops the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a "utf-8-sig" StreamReader using a range
        # of size hints (None means one unbounded read() per call) and check
        # that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1144
Chris A2565ede2020-03-02 01:39:50 -05001145
class EscapeDecodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_decode``."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for i in range(256):
            b = bytes([i])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # check(input, expect) asserts decode(input) == (expect, len(input)).
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognized escapes are kept verbatim and must warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Incomplete \x escapes: ValueError by default, dropped under
        # "ignore", turned into b"?" under "replace".
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001205
Victor Stinnerf96418d2015-09-21 23:06:27 +02001206
# From RFC 3492
# Each entry is a (unicode_text, punycode_bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a 2-tuple
# (e.g. a missing comma that merged the text and the bytes literal).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314
Victor Stinnerf96418d2015-09-21 23:06:27 +02001315
class PunycodeTest(unittest.TestCase):
    """Tests for the "punycode" codec, driven by ``punycode_testcases``."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again from a bytes object rebuilt via str.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))

    def test_decode_invalid(self):
        # (input, error handler, expected result); an exception instance
        # as the expected result means decoding must raise UnicodeError.
        testcases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in testcases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors), expected)
1346
Victor Stinnerf96418d2015-09-21 23:06:27 +02001347
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None means that the input must be rejected, and
# (None, None) is a placeholder for a vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz'
     b'\xef\xb8\x80\xef\xb8\x88\xef\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0'
     b'\xc2\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f'
     b'\xe3\x8c\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3\x83\x88'
     b'\xe3\x83\xab'
     b'i\xcc\x87tel\x28d\x29'
     b'\xe3\x82\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1500
1501
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        # Count from 1 so that the sub-test label matches the "3.N"
        # numbering used in the comments of the nameprep_tests table.
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # One sub-test per vector: a failure names the vector it
            # belongs to and the remaining vectors are still checked.
            with self.subTest(vector="3.%d" % number):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001520
Victor Stinnerf96418d2015-09-21 23:06:27 +02001521
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been read, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feeding the input one byte at a time must produce the same text
        # as a one-shot decode.
        samples = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in samples:
            with self.subTest(encoded=encoded):
                self.assertEqual(
                    "".join(codecs.iterdecode((bytes([c]) for c in encoded),
                                              "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # Output only appears once the dot ending a label has been fed;
        # the trailing label is returned by the final=True call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # After reset() the decoder must behave like a fresh one.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Feeding the input one character at a time must produce the same
        # bytes as a one-shot encode.
        samples = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in samples:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # Output only appears once the dot ending a label has been fed;
        # the trailing label is returned by the final=True call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # After reset() the encoder must behave like a fresh one.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                # Expect UnicodeError specifically, so that an unrelated
                # failure (e.g. a TypeError) cannot make this test pass.
                self.assertRaises(UnicodeError,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(UnicodeError,
                                  b"python.org".decode, "idna", errors)
1607
Victor Stinnerf96418d2015-09-21 23:06:27 +02001608
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_locale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must list exactly these names, and each of them
        # must be an attribute of the module.
        expected_names = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_names, codecs.__all__)
        for public_name in codecs.__all__:
            getattr(codecs, public_name)

    def test_open(self):
        # codecs.open() returns a StreamReaderWriter whatever the mode.
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec raises for any input, empty or not, and
        # for any error handler.
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)

    def test_file_closes_if_lookup_error_raised(self):
        # The file opened by codecs.open() must be closed again when the
        # encoding lookup fails.
        fake_open = mock.mock_open()
        with mock.patch('builtins.open', fake_open) as file:
            with self.assertRaises(LookupError):
                codecs.open(support.TESTFN, 'wt', 'invalid-encoding')

            file().close.assert_called()
1724
Victor Stinnerf96418d2015-09-21 23:06:27 +02001725
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001735
Victor Stinnerf96418d2015-09-21 23:06:27 +02001736
class EncodedFileTest(unittest.TestCase):
    """Check that codecs.EncodedFile transcodes on read and on write."""

    def test_basic(self):
        # Reading: UTF-8 content of the file comes back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data ends up in the file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001748
# Codecs exercised by the generic round-trip tests below.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are only added when the codecs module provides the
# matching *_encode function.
all_unicode_encodings.extend(
    optional for optional in ("mbcs", "oem")
    if hasattr(codecs, optional + "_encode")
)

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001866
Victor Stinnerf96418d2015-09-21 23:06:27 +02001867
Walter Dörwald3abcb012007-04-16 22:10:50 +00001868class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001869 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001870 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001871 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001872 name = codecs.lookup(encoding).name
1873 if encoding.endswith("_codec"):
1874 name += "_codec"
1875 elif encoding == "latin_1":
1876 name = "latin_1"
1877 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001878
Inada Naoki6a16b182019-03-18 15:44:11 +09001879 (b, size) = codecs.getencoder(encoding)(s)
1880 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1881 (chars, size) = codecs.getdecoder(encoding)(b)
1882 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001883
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001884 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001885 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001886 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001887 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001888 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001889 for c in s:
1890 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001891 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001892 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001893 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001894 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001895 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001896 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001897 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001898 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001899 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001900 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001901
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001902 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001903 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001904 try:
1905 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001906 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001907 pass
1908 else:
1909 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001910 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001911 for c in s:
1912 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001913 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001914 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001915 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001916 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001917 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001918 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001919 self.assertEqual(decodedresult, s,
1920 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001921
1922 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001923 result = "".join(codecs.iterdecode(
1924 codecs.iterencode(s, encoding), encoding))
1925 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001926
1927 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001928 result = "".join(codecs.iterdecode(
1929 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001930 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001931
Victor Stinner554f3f02010-06-16 23:33:54 +00001932 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001933 # check incremental decoder/encoder with errors argument
1934 try:
1935 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001936 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001937 pass
1938 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001939 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001940 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001941 decodedresult = "".join(decoder.decode(bytes([c]))
1942 for c in encodedresult)
1943 self.assertEqual(decodedresult, s,
1944 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001945
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_basics_capi(self):
    # Round-trip a short ASCII string through the incremental encoder and
    # decoder objects returned by the _testcapi helpers, feeding them one
    # character (resp. one byte) at a time.
    #
    # _testcapi is an optional import at the top of this file and is None
    # when the module is missing; without the skip above the first helper
    # call would raise AttributeError, which "except LookupError" below
    # does not catch.
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API
            encodedresult = b""
            for c in s:
                encodedresult += cencoder.encode(c)
            # final=True flushes whatever the encoder still buffers
            encodedresult += cencoder.encode("", True)
            cdecoder = _testcapi.codec_incrementaldecoder(encoding)
            decodedresult = ""
            for c in encodedresult:
                decodedresult += cdecoder.decode(bytes([c]))
            decodedresult += cdecoder.decode(b"", True)
            self.assertEqual(decodedresult, s,
                             "encoding=%r" % encoding)

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(cencoder.encode(c) for c in s)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decodedresult = "".join(cdecoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001983
def test_seek(self):
    # Rewinding a stream reader must reset the codec state and buffers, so
    # that every full read after seek(0, 0) returns the complete text again.
    # all codecs should be able to encode these
    text = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # "idna" is excluded because of SF bug #1163178 (FIXME)
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        raw_stream = io.BytesIO(text.encode(encoding))
        reader = codecs.getreader(encoding)(raw_stream)
        for _ in range(5):
            reader.seek(0, 0)
            self.assertEqual(text, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00001998
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all but "idna"
    # and "punycode" are additionally checked to reject an int argument.
    for encoding in all_unicode_encodings:
        decode = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decode()
        if encoding in ("idna", "punycode"):
            continue
        with self.assertRaises(TypeError):
            decode(42)
2005
def test_bad_encode_args(self):
    # Calling any encoder function without arguments must raise TypeError.
    for encoding in all_unicode_encodings:
        encode = codecs.getencoder(encoding)
        with self.assertRaises(TypeError):
            encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002010
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002011 def test_encoding_map_type_initialized(self):
2012 from encodings import cp1140
2013 # This used to crash, we are only verifying there's no crash.
2014 table_type = type(cp1140.encoding_table)
2015 self.assertEqual(table_type, table_type)
2016
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    text = "abc123"
    candidates = (name for name in all_unicode_encodings
                  if name not in broken_unicode_with_stateful)
    for encoding in candidates:
        encoded = text.encode(encoding)
        self.check_state_handling_decode(encoding, text, encoded)
        self.check_state_handling_encode(encoding, text, encoded)
2024
Victor Stinnerf96418d2015-09-21 23:06:27 +02002025
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with str, int->str and int->int maps."""

    # Input shared by (almost) every case; the incomplete maps below have no
    # usable entry for its last byte, 0x02.
    _INPUT = b"\x00\x01\x02"

    # (error handler, expected text) when byte 0x02 cannot be mapped.
    _LENIENT_RESULTS = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _check(self, errors, mapping, expected):
        # All three input bytes are always reported as consumed.
        self.assertEqual(
            codecs.charmap_decode(self._INPUT, errors, mapping),
            (expected, 3)
        )

    def _check_strict_raises(self, exc_type, mapping):
        self.assertRaises(exc_type,
            codecs.charmap_decode, self._INPUT, "strict", mapping
        )

    def test_decode_with_string_map(self):
        self._check("strict", "abc", "abc")
        self._check("strict", "\U0010FFFFbc", "\U0010FFFFbc")

        # A map that is too short, or that maps the byte to U+FFFE, leaves
        # byte 0x02 undefined.
        undefined_last = ("ab", "ab\ufffe")
        for mapping in undefined_last:
            self._check_strict_raises(UnicodeDecodeError, mapping)

        for errors, expected in self._LENIENT_RESULTS:
            for mapping in undefined_last:
                self._check(errors, mapping, expected)

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        well_defined = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in well_defined:
            self._check("strict", mapping, expected)

        # Byte 0x02 is undefined when its key is missing, maps to None or
        # maps to '\ufffe' (the latter is Issue #14850).
        undefined_last = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in undefined_last:
            self._check_strict_raises(UnicodeDecodeError, mapping)

        for errors, expected in self._LENIENT_RESULTS:
            for mapping in undefined_last:
                self._check(errors, mapping, expected)

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a, b, c = (ord(ch) for ch in 'abc')

        self._check("strict", {0: a, 1: b, 2: c}, "abc")

        # Issue #15379
        self._check("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc")

        self._check("strict", {0: sys.maxunicode, 1: b, 2: c},
                    chr(sys.maxunicode) + "bc")

        # A code point beyond sys.maxunicode is a TypeError, not a decode
        # error.
        self._check_strict_raises(TypeError,
                                  {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 0x02 is undefined when its key is missing or maps to 0xFFFE.
        undefined_last = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in undefined_last:
            self._check_strict_raises(UnicodeDecodeError, mapping)

        for errors, expected in self._LENIENT_RESULTS:
            for mapping in undefined_last:
                self._check(errors, mapping, expected)
2260
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002261
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8; reading through the wrapper
        # yields the same character as Latin-1 bytes.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Leaving the with block must close the wrapped stream, exactly as
        # checked for EncodedFile above.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002275
Victor Stinnerf96418d2015-09-21 23:06:27 +02002276
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        str_rejecting = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            str_rejecting.append(codecs.mbcs_decode)
        for decode in str_rejecting:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # \U00110000 lies beyond the last code point: strict decoding fails,
        # the lenient handlers substitute for all 10 input characters.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002316
Serhiy Storchakad6793772013-01-29 10:20:44 +02002317
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode_escape encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the character with the
        # same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use the \xhh form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised escapes are kept verbatim but must emit a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of hex digits a complete escape needs).
        # \U takes 8 digits, so every truncated length 0..7 is exercised.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets and one at the very
                # end of the input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2404
2405
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the raw_unicode_escape encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below 256 is encoded as the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but 'u' or 'U' passes through
        # untouched.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of hex digits a complete escape needs).
        # \U takes 8 digits, so every truncated length 0..7 is exercised.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets and one at the very
                # end of the input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2454
2455
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode()."""

    def test_escape_encode(self):
        # input bytes -> expected (escaped bytes, number of input bytes)
        expected_by_input = {
            b'': (b'', 0),
            b'foobar': (b'foobar', 6),
            b'spam\0eggs': (b'spam\\x00eggs', 9),
            b'a\'b': (b"a\\'b", 3),
            b'b\\c': (b'b\\\\c', 3),
            b'c\nd': (b'c\\nd', 3),
            b'd\re': (b'd\\re', 3),
            b'f\x7fg': (b'f\\x7fg', 3),
        }
        for data, output in expected_by_input.items():
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)
        # Only bytes is accepted: str and bytearray are both rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2474
2475
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler."""

    def _check_roundtrip(self, encoding, raw, text):
        # Undecodable bytes become lone surrogates on decoding, and those
        # surrogates turn back into the original bytes on encoding.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2508
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002509
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Each scenario starts over with the file reopened in 'w+' mode.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as f:
                f.write(text)
                f.write(text)
                # Read the content back twice, rewinding before each read.
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002565
Victor Stinner3fed0872010-05-22 02:16:27 +00002566
# Bytes-to-bytes codecs that are always available; the compression codecs
# below are added only when their backing module can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alias names per transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound so it can be tested against None.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional as well; when the import fails nothing is registered and
# the name stays unbound.
with contextlib.suppress(ImportError):
    import bz2
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002596
Victor Stinnerf96418d2015-09-21 23:06:27 +02002597
Georg Brandl02524622010-12-02 18:06:51 +00002598class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002599
def test_basics(self):
    # Every bytes transform must round-trip all 256 byte values through the
    # generic encoder/decoder functions and report the full input length as
    # consumed in both directions.
    payload = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(payload)
            self.assertEqual(consumed, len(payload))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)
Georg Brandl02524622010-12-02 18:06:51 +00002610
def test_read(self):
    # read() on a stream reader must undo the transform.
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002618
def test_readline(self):
    # readline() on a stream reader must undo the transform as well.
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002626
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # Encoding: bytes and memoryview input must agree.
            plain_view = memoryview(original)
            encoded = codecs.encode(original, encoding)
            encoded_from_view = codecs.encode(plain_view, encoding)
            self.assertEqual(encoded_from_view, encoded)
            # Decoding: same check, plus the round trip to the original.
            encoded_view = memoryview(encoded)
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(encoded_view, encoding)
            self.assertEqual(decoded_from_view, decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002644
def test_text_to_binary_blacklists_binary_transforms(self):
    # Check binary -> binary codecs give a good error for str input
    text = "bad input type"
    pattern_template = (r"{!r} is not a text encoding; "
                        r"use codecs.encode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            pattern = pattern_template.format(encoding)
            with self.assertRaisesRegex(LookupError, pattern) as caught:
                text.encode(encoding)
            # The error must not be chained to another exception.
            self.assertIsNone(caught.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002656
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002657 def test_text_to_binary_blacklists_text_transforms(self):
2658 # Check str.encode gives a good error message for str -> str codecs
2659 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002660 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002661 with self.assertRaisesRegex(LookupError, msg):
2662 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002663
def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode() and bytearray.decode() must both reject
    # binary -> binary codecs with a helpful error message.
    data = b"encode first to ensure we meet any format restrictions"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded_data = codecs.encode(data, encoding)
            pattern = (rf"{encoding!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
            # Same check for the immutable and the mutable container.
            for candidate in (encoded_data, bytearray(encoded_data)):
                with self.assertRaisesRegex(LookupError, pattern):
                    candidate.decode(encoding)
2678
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002679 def test_binary_to_text_blacklists_text_transforms(self):
2680 # Check str -> str codec gives a good error for binary input
2681 for bad_input in (b"immutable", bytearray(b"mutable")):
2682 with self.subTest(bad_input=bad_input):
2683 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002684 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002685 with self.assertRaisesRegex(LookupError, msg) as failure:
2686 bad_input.decode("rot_13")
2687 self.assertIsNone(failure.exception.__cause__)
2688
@unittest.skipUnless(zlib, "Requires zlib support")
def test_custom_zlib_error_is_wrapped(self):
    # Malformed input must produce an error that names the codec and is
    # chained from a cause of the very same exception type.
    pattern = "^decoding with 'zlib_codec' codec failed"
    with self.assertRaisesRegex(Exception, pattern) as caught:
        codecs.decode(b"hello", "zlib_codec")
    wrapper = caught.exception
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2697
2698 def test_custom_hex_error_is_wrapped(self):
2699 # Check hex codec gives a good error for malformed input
2700 msg = "^decoding with 'hex_codec' codec failed"
2701 with self.assertRaisesRegex(Exception, msg) as failure:
2702 codecs.decode(b"hello", "hex_codec")
2703 self.assertIsInstance(failure.exception.__cause__,
2704 type(failure.exception))
2705
2706 # Unfortunately, the bz2 module throws OSError, which the codec
2707 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002708
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias must resolve to the same canonical codec name as the
    # codec it is an alias for.
    for codec_name, aliases in transform_aliases.items():
        canonical = codecs.lookup(codec_name).name
        for alias in aliases:
            with self.subTest(alias=alias):
                self.assertEqual(codecs.lookup(alias).name, canonical)
2717
Martin Panter06171bd2015-09-12 00:34:28 +00002718 def test_quopri_stateless(self):
2719 # Should encode with quotetabs=True
2720 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2721 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2722 # But should still support unescaped tabs and spaces
2723 unescaped = b"space tab eol\n"
2724 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2725
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002726 def test_uu_invalid(self):
2727 # Missing "begin" line
2728 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2729
Nick Coghlan8b097b42013-11-13 23:49:21 +10002730
2731# The codec system tries to wrap exceptions in order to ensure the error
2732# mentions the operation being performed and the codec involved. We
2733# currently *only* want this to happen for relatively stateless
2734# exceptions, where the only significant information they contain is their
2735# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002736
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: return the registered entry, else None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2744
def _forget_codec(codec_name):
    """Do-nothing fallback, kept when _codecs has no _forget_codec."""


try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    pass
2751
2752
class ExceptionChainingTest(unittest.TestCase):

    def setUp(self):
        # A codec search function cannot be unregistered, so the codec is
        # named after the test case repr, which leaves it fairly harmless
        # once the test case finishes. The codecs module normalizes codec
        # names, so the same normalization is applied here.
        # id(self) is appended to keep the name truly unique and avoid
        # codec cache issues when these tests run multiple times
        # (e.g. when hunting for refleaks).
        self.codec_name = encodings.normalize_encoding(
            f"{self!r}{id(self)}").lower()

        # The object to raise lives on the instance: codec caching means
        # the codec entry can't be recreated, and regrtest refleak hunting
        # runs the same test instance multiple times. The codecs therefore
        # call back in to the instance to find out what to raise, rather
        # than binding it in a closure to an object that may change on the
        # next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(self.codec_name)

    def set_codec(self, encode, decode):
        """Register *encode*/*decode* in the local registry under codec_name."""
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect a wrapped *exc_type* whose cause is also an *exc_type*."""
        full_msg = (rf"{operation} with {self.codec_name!r} codec failed "
                    rf"\({exc_type.__name__}: {msg}\)")
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check all four encode/decode entry points wrap *obj_to_raise*."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", "str_input".encode, (name,)),
            ("encoding", codecs.encode, ("str_input", name)),
            ("decoding", b"bytes input".decode, (name,)),
            ("decoding", codecs.decode, (b"bytes input", name)),
        )
        for operation, func, args in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                func(*args)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check all four entry points let *obj_to_raise* through as-is."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            ("str input".encode, (name,)),
            (codecs.encode, ("str input", name)),
            (b"bytes input".decode, (name,)),
            (codecs.decode, (b"bytes input", name)),
        )
        for func, args in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                func(*args)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, f"^{text}$")

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = f"^unknown encoding: {name}$"
        # The initial codec lookup should not be wrapped
        attempts = (
            ("str input".encode, (name,)),
            (codecs.encode, ("str input", name)),
            (b"bytes input".decode, (name,)),
            (codecs.decode, (b"bytes input", name)),
        )
        for func, args in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                func(*args)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods.
        # However, third party codecs won't be flagged, so the case where
        # an inappropriate output type is produced must still be handled
        # appropriately.
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name

        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")

        # Text model methods should complain
        encode_pattern = (
            rf"^{name!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_pattern):
            "str_input".encode(name)
        decode_pattern = (
            rf"^{name!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_pattern):
            b"bytes input".decode(name)
2912
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002913
Georg Brandl02524622010-12-02 18:06:51 +00002914
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode().
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = decoded[0], decoded[1]
            self.assertEqual(text, expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data, consumed = encoded[0], encoded[1]
            self.assertEqual(data, expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        code_pages = (50220, 50221, 50222, 50225, 50227, 50229,
                      *range(57002, 57011+1), 65000)
        for cp in code_pages:
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if not is_code_page_present(cp):
                if support.verbose:
                    print(f" skipping cp={cp}")
                continue
            self.assertEqual(codecs.code_page_decode(cp, b'abc'),
                             ('abc', 3), f'cp{cp}')
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        # Non-final decoding: an incomplete trailing sequence is left
        # unconsumed instead of raising.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            info = codecs.lookup('cp123')
            self.assertEqual(info.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        filler = b'01234567' * ((size//8)-1)
        encoded = filler + b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff'
        del filler
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before inspecting the (equally large) output.
        del encoded
        text, consumed = decoded[0], decoded[1]
        self.assertEqual(len(text), consumed)
        self.assertEqual(text[:10], '0123456701')
        self.assertEqual(text[-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = b'0123456\xed\x84\x80' * (size//8)
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before inspecting the output.
        del encoded
        text = decoded[0]
        self.assertEqual(len(text), size)
        self.assertEqual(text[:10], '0123456\ud10001')
        self.assertEqual(text[-11:], '56\ud1000123456\ud100')
3129
Victor Stinner3a50e702011-10-18 21:21:00 +02003130
class ASCIITest(unittest.TestCase):
    def test_encode(self):
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                self.assertEqual(text.encode('ascii', handler), wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                self.assertEqual(raw.decode('ascii', handler), wanted)
3168
3169
class Latin1Test(unittest.TestCase):
    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                self.assertEqual(text.encode('latin1'), wanted)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                self.assertEqual(text.encode('latin1', handler), wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        # (raw bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                self.assertEqual(raw.decode('latin1'), wanted)
3205
3206
class StreamRecoderTest(unittest.TestCase):
    def test_writelines(self):
        sink = io.BytesIO()
        ascii_codec = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            sink, ascii_codec.encode, ascii_codec.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(sink.getvalue(), b'ab')

    def test_write(self):
        sink = io.BytesIO()
        latin1_codec = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            sink, latin1_codec.encode, latin1_codec.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)

        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(sink.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        raw = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')

        self.assertEqual(recoded.readline(), b'line1\n')
        recoded.seek(0)
        # After rewinding, the whole file reads back line by line and
        # then reports end of file.
        for wanted in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(recoded.readline(), wanted)

    def test_seeking_write(self):
        raw = io.BytesIO('123456789\n'.encode('utf-16-le'))
        recoded = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        recoded.seek(2)
        recoded.write(b'\nabc\n')
        self.assertEqual(recoded.readline(), b'789\n')
        recoded.seek(0)
        for wanted in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(recoded.readline(), wanted)
3251
Jelle Zijlstrab3be4072019-05-22 08:18:26 -07003252
Victor Stinner3d4226a2018-08-29 22:21:32 +02003253@unittest.skipIf(_testcapi is None, 'need _testcapi module')
3254class LocaleCodecTest(unittest.TestCase):
3255 """
3256 Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
3257 """
3258 ENCODING = sys.getfilesystemencoding()
3259 STRINGS = ("ascii", "ulatin1:\xa7\xe9",
3260 "u255:\xff",
3261 "UCS:\xe9\u20ac\U0010ffff",
3262 "surrogates:\uDC80\uDCFF")
3263 BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
3264 SURROGATES = "\uDC80\uDCFF"
3265
def encode(self, text, errors="strict"):
    """Encode *text* with _testcapi.EncodeLocaleEx() and return the result."""
    # The middle argument is always 0 here (presumably the
    # "current locale" flag -- confirm against _testcapi).
    encoded = _testcapi.EncodeLocaleEx(text, 0, errors)
    return encoded
3268
def check_encode_strings(self, errors):
    """Compare self.encode() with str.encode() for every sample string.

    When str.encode() rejects a string, self.encode() must raise
    RuntimeError with an "encode error" message instead.
    """
    for text in self.STRINGS:
        with self.subTest(text=text):
            try:
                expected = text.encode(self.ENCODING, errors)
            except UnicodeEncodeError:
                expected = None

            if expected is None:
                with self.assertRaises(RuntimeError) as cm:
                    self.encode(text, errors)
                self.assertRegex(str(cm.exception),
                                 r"encode error: pos=[0-9]+, reason=")
            else:
                self.assertEqual(self.encode(text, errors), expected)
3282
3283 def test_encode_strict(self):
3284 self.check_encode_strings("strict")
3285
3286 def test_encode_surrogateescape(self):
3287 self.check_encode_strings("surrogateescape")
3288
3289 def test_encode_surrogatepass(self):
3290 try:
3291 self.encode('', 'surrogatepass')
3292 except ValueError as exc:
3293 if str(exc) == 'unsupported error handler':
3294 self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
3295 f"surrogatepass error handler")
3296 else:
3297 raise
3298
3299 self.check_encode_strings("surrogatepass")
3300
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003301 def test_encode_unsupported_error_handler(self):
3302 with self.assertRaises(ValueError) as cm:
3303 self.encode('', 'backslashreplace')
3304 self.assertEqual(str(cm.exception), 'unsupported error handler')
3305
Victor Stinner3d4226a2018-08-29 22:21:32 +02003306 def decode(self, encoded, errors="strict"):
3307 return _testcapi.DecodeLocaleEx(encoded, 0, errors)
3308
3309 def check_decode_strings(self, errors):
3310 is_utf8 = (self.ENCODING == "utf-8")
3311 if is_utf8:
3312 encode_errors = 'surrogateescape'
3313 else:
3314 encode_errors = 'strict'
3315
3316 strings = list(self.BYTES_STRINGS)
3317 for text in self.STRINGS:
3318 try:
3319 encoded = text.encode(self.ENCODING, encode_errors)
3320 if encoded not in strings:
3321 strings.append(encoded)
3322 except UnicodeEncodeError:
3323 encoded = None
3324
3325 if is_utf8:
3326 encoded2 = text.encode(self.ENCODING, 'surrogatepass')
3327 if encoded2 != encoded:
3328 strings.append(encoded2)
3329
3330 for encoded in strings:
3331 with self.subTest(encoded=encoded):
3332 try:
3333 expected = encoded.decode(self.ENCODING, errors)
3334 except UnicodeDecodeError:
3335 with self.assertRaises(RuntimeError) as cm:
3336 self.decode(encoded, errors)
3337 errmsg = str(cm.exception)
3338 self.assertTrue(errmsg.startswith("decode error: "), errmsg)
3339 else:
3340 decoded = self.decode(encoded, errors)
3341 self.assertEqual(decoded, expected)
3342
3343 def test_decode_strict(self):
3344 self.check_decode_strings("strict")
3345
3346 def test_decode_surrogateescape(self):
3347 self.check_decode_strings("surrogateescape")
3348
3349 def test_decode_surrogatepass(self):
3350 try:
3351 self.decode(b'', 'surrogatepass')
3352 except ValueError as exc:
3353 if str(exc) == 'unsupported error handler':
3354 self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
3355 f"surrogatepass error handler")
3356 else:
3357 raise
3358
3359 self.check_decode_strings("surrogatepass")
3360
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003361 def test_decode_unsupported_error_handler(self):
3362 with self.assertRaises(ValueError) as cm:
3363 self.decode(b'', 'backslashreplace')
3364 self.assertEqual(str(cm.exception), 'unsupported error handler')
3365
Victor Stinner3d4226a2018-08-29 22:21:32 +02003366
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        # One-shot encoding through codecs.encode().
        self.assertEqual(
            codecs.encode("Caesar liked ciphers", 'rot-13'),
            'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        # One-shot decoding through codecs.decode().
        self.assertEqual(
            codecs.decode('Rg gh, Oehgr?', 'rot-13'),
            'Et tu, Brute?')

    def test_incremental_encode(self):
        # Look up the incremental encoder class, instantiate it and feed
        # it the whole input in a single call.
        encoder_cls = codecs.getincrementalencoder('rot-13')
        self.assertEqual(
            encoder_cls().encode('ABBA nag Cheryl Baker'),
            'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        # Same as above, for the incremental decoder.
        decoder_cls = codecs.getincrementaldecoder('rot-13')
        self.assertEqual(
            decoder_cls().decode('terra Ares envy tha'),
            'green Nerf rail gun')
3386
3387
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        """rot13(infile, outfile) must write the ROT-13 form of infile."""
        # Import the submodule explicitly.  A plain ``import encodings``
        # does not make ``encodings.rot_13`` available as an attribute
        # unless some other code already imported it, so attribute access
        # would fail with AttributeError when this test runs on its own.
        from encodings.rot_13 import rot13

        infile = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        outfile = io.StringIO()
        rot13(infile, outfile)

        # Rewind the output buffer before reading back what was written.
        outfile.seek(0)
        plain_text = outfile.read()
        self.assertEqual(
            plain_text,
            'To be, or not to be, that is the question')
3402
3403
# Run every test case defined in this module when the file is executed
# directly as a script rather than imported.
if __name__ == "__main__":
    unittest.main()