blob: e8c7d76544e18032660fd710610bc3dd63a3474a [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# Optional helper modules: each name falls back to None when the import
# fails, so later code can test the name instead of re-importing.
try:
    import _testcapi
except ImportError:
    _testcapi = None

try:
    import ctypes
except ImportError:
    ctypes = None
    # Sentinel meaning "width of wchar_t unknown" (ctypes is unavailable).
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` callable bound to *coder*.

    The returned function asserts (through ``self.assertEqual``) that
    ``coder(input)`` yields the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        outcome = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(outcome, wanted)
    return check
29
Victor Stinnerf96418d2015-09-21 23:06:27 +020030
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer's type (e.g. b"" or "").
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* (the default) drains everything that is pending.
        """
        pending = self._buffer
        if size < 0:
            # Slicing with [:0] yields an empty buffer of the same type.
            taken, remaining = pending, pending[:0]
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
50
Victor Stinnerf96418d2015-09-21 23:06:27 +020051
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() of
    incremental codecs.

    The host class must provide the unittest assertion methods used
    below (assertIsInstance, assertFalse, assertEqual).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder state round-tripping at every split point of *s*.

        For each prefix of the encoded input *s*, the prefix is decoded,
        the decoder state is transplanted into a fresh decoder, and the
        remainder is decoded there; the concatenation must equal *u*.
        """
        for split in range(len(s)+1):
            decoder = codecs.getincrementaldecoder(encoding)()
            part1 = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                # (assertFalse reports the unexpected output on failure)
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            part2 = decoder.decode(s[split:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder state round-tripping at every split point of *u*.

        For each prefix of the text *u*, the prefix is encoded, the encoder
        state is transplanted into a fresh encoder, and the remainder is
        encoded there; the concatenation must equal *s*.
        """
        for split in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[split:], True)
            self.assertEqual(s, part1+part2)
84
Victor Stinnerf96418d2015-09-21 23:06:27 +020085
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    The methods read ``self.encoding``; test_lone_surrogates additionally
    reads ``self.ill_formed_sequence``.  Both are expected to be supplied
    by the concrete subclass.
    """

    def check_partial(self, input, partialresults):
        # Encode input and feed the resulting bytes one at a time to a
        # StreamReader and to an incremental decoder (before and after a
        # reset()).  After every byte, everything decoded so far must equal
        # the matching entry of partialresults.
        encoded = input.encode(self.encoding)

        # StreamReader reading from a queue that is filled byte by byte
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for byte, expected in zip(encoded, partialresults):
            queue.write(bytes([byte]))
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        decoder = codecs.getincrementaldecoder(self.encoding)()
        decoded = ""
        for byte, expected in zip(encoded, partialresults):
            decoded += decoder.decode(bytes([byte]))
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(decoder.decode(b"", True), "")
        self.assertEqual(decoder.buffer, b"")

        # Check whether the reset method works properly
        decoder.reset()
        decoded = ""
        for byte, expected in zip(encoded, partialresults):
            decoded += decoder.decode(bytes([byte]))
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(decoder.decode(b"", True), "")
        self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        single_bytes = [bytes([byte]) for byte in encoded]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Call readline() until it returns the empty string and join
            # the collected lines with "|".
            reader = getreader(input)
            next_line = lambda: reader.readline(size=size, keepends=keepends)
            return "|".join(iter(next_line, ""))

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        for size in (None, 10):
            self.assertEqual(readalllines(s, True, size), sexpected)
            self.assertEqual(readalllines(s, False, size), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vwo = [(index*200+200)*"\u3042" for index in range(len(lineends))]
        vw = [body + lineend for body, lineend in zip(vwo, lineends)]
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                prefix = size*"a"
                s = 10*(prefix + lineend + "xxx\n")
                for keepends in (True, False):
                    if keepends:
                        first, second = prefix + lineend, "xxx\n"
                    else:
                        first, second = prefix, "xxx"
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            # Fresh reader positioned at the start of the encoded rhyme
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.read(), ''.join(lines[1:]))
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.read(1), lines[1][0])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.readlines(), lines[1:])
        self.assertEqual(reader.read(), '')

        # Test read(n) followed by read()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(1), data[5])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back each "\r\n"
        # terminated source line unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        encoded = "".join(s).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        for index, line in enumerate(reader):
            self.assertEqual(line, s[index])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so every readline() only sees
        # what has been written up to that point.
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        encoded = (s1+s2+s3).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # Each line comes back intact, then "" once the stream is exhausted
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        encoded = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # Each line comes back intact, then "" once the stream is exhausted
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)

    # Text expected in place of the ill-formed sequence when decoding with
    # the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        # Encoding a lone surrogate with each error handler
        encode_cases = [
            ("backslashreplace", "[\\udc80]"),
            ("namereplace", "[\\udc80]"),
            ("xmlcharrefreplace", "[&#56448;]"),
            ("ignore", "[]"),
            ("replace", "[?]"),
        ]
        for handler, expected in encode_cases:
            self.assertEqual("[\uDC80]".encode(self.encoding, handler),
                             expected.encode(self.encoding))

        # sequential surrogate characters
        for handler, expected in [("ignore", "[]"), ("replace", "[??]")]:
            self.assertEqual("[\uD800\uDC80]".encode(self.encoding, handler),
                             expected.encode(self.encoding))

        # Whatever the codec emits for the empty string is stripped from
        # the front of each encoded fragment below.
        bom = "".encode(self.encoding)
        surroundings = [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]
        for before, after in surroundings:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200408
Victor Stinnerf96418d2015-09-21 23:06:27 +0200409
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order announced by a BOM)."""

    encoding = "utf-32"
    # U+DC80 (a lone surrogate) as a single UTF-32 unit in native byte order
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input consisting only of 0xff bytes does not start with a BOM
        for raw in (4*b"\xff", 8*b"\xff"):
            s = io.BytesIO(raw)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated unit is replaced or dropped according to the handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
504
Victor Stinnerf96418d2015-09-21 23:06:27 +0200505
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # U+DC80 (a lone surrogate) as a single little-endian UTF-32 unit
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected entry per input byte (four bytes per character)
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", "", "",
                "\x00", "\x00", "\x00", "\x00",
                "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
550
Victor Stinnerf96418d2015-09-21 23:06:27 +0200551
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # U+DC80 (a lone surrogate) as a single big-endian UTF-32 unit
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected entry per input byte (four bytes per character)
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", "", "",
                "\x00", "\x00", "\x00", "\x00",
                "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
596
597
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (byte order announced by a BOM)."""

    encoding = "utf-16"
    # U+DC80 (a lone surrogate) as a single UTF-16 unit in native byte order
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input consisting only of 0xff bytes does not start with a BOM
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(raw)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated unit is replaced or dropped according to the handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000683
class UTF16LETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-le"
    # U+DC80 (a lone low surrogate) in little-endian byte order.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # No BOM for the explicit-endian codec: a character appears as soon
        # as its second byte has been read (fourth byte for non-BMP).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one bad row from hiding the remaining ones.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 is the surrogate pair D800 DE03, each unit little-endian.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
727
class UTF16BETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # No BOM for the explicit-endian codec: a character appears as soon
        # as its second byte has been read (fourth byte for non-BMP).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one bad row from hiding the remaining ones.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 is the surrogate pair D800 DE03, each unit big-endian.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
771
class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    # Three-byte encoding of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    # One replacement character per byte of the sequence above.
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected on encoded output; empty here so that subclasses for
    # BOM-writing variants can override it.
    BOM = b''

    def test_partial(self):
        # One entry per encoded byte; a character only shows up once its
        # last byte has been read (1, 2, 2, 3, 3 and 4 bytes respectively).
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text sits on the boundaries between encoded lengths.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # The same two invalid bytes decoded under each error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # U+DC80 is handled by surrogateescape; the error must cover only
        # the two surrogates that follow it.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: surrogates are written as ordinary three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same byte sequences are accepted back.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated surrogate sequence is still an error, both at the end
        # of the input and when followed by an unrelated byte.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000847
Victor Stinnerf96418d2015-09-21 23:06:27 +0200848
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means that encoding
        # is expected to raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # subTest keeps one bad row from hiding the remaining ones.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # (bytes, error handler, expected text); None means that decoding
        # is expected to raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # subTest keeps one bad row from hiding the remaining ones.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Every other handler has its own well-defined substitute.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # Each sample is checked as an encode/decode pair.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
936
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200937
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # + (the shift character itself is written as "+-")
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters: everything below 0x80 not covered above
        # is emitted as one shifted run.
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One entry per step of encoded input; runs of identical entries
        # are steps inside a shifted sequence that has not yet yielded a
        # complete character.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode alike.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        # Decoding is checked with and without the closing '-'.
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, result of decoding with errors='replace').  Some rows
        # decode to a lone surrogate rather than to U+FFFD.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001070
1071
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte cannot form a UTF-16 code unit; with byteorder 0
        # and final=True the strict decode has to raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1079
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # Any object exposing the buffer interface is accepted, not only
        # bytes; the result is its content as bytes plus the length read.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str is accepted as well and yields empty bytes.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument, and an argument with no buffer to read from.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1095
class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"
    # Encoded output of this codec starts with the UTF-8 signature.
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # The text itself begins with U+FEFF, so the encoded form carries
        # the signature followed by a second, literal BOM character.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder strips the signature the encoder wrote.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read() size hints (None means an unbounded read()) and check that
        # the chunks returned join up to unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the signature: it must not reach the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1180
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            b = bytes([value])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is a line continuation and vanishes.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        # Single-letter control escapes.
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes take at most three octal digits.
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        # Octal 501 does not fit in a byte; the expected result is "A".
        check(br"[\501]", b"[A]")
        # Hex escapes take exactly two hex digits.
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognised escapes are kept verbatim but must warn.  For the
        # lower-case letters only those outside the valid set are checked;
        # the upper-case form is checked for every letter.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        decode = codecs.escape_decode
        # "\x" with no hex digit after it.
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        # "\x" with only one hex digit after it.
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001240
Victor Stinnerf96418d2015-09-21 23:06:27 +02001241
Martin v. Löwis2548c732003-04-18 10:39:54 +00001242# From RFC 3492
1243punycode_testcases = [
1244 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001245 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1246 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001247 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001248 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001249 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001250 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001252 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001253 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001255 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1256 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1257 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001258 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001259 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001260 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1261 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1262 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001263 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001264 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001265 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001266 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1267 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1268 "\u0939\u0948\u0902",
1269 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001270
1271 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001272 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001273 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1274 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001275
1276 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001277 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1278 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1279 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001280 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1281 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001282
1283 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001284 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1285 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1286 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1287 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001288 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289
1290 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001291 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1292 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1293 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1294 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1295 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001296 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001297
1298 # (K) Vietnamese:
1299 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1300 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001301 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1302 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1303 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1304 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001305 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001306
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001308 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001309 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001310
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001312 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1313 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1314 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001315 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316
1317 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1319 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1320 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001321 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001322
1323 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001324 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001325 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326
1327 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001328 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1329 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001330 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001331
1332 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001333 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001334 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335
1336 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001337 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001338 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339
1340 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001341 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1342 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001343 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001344 ]
1345
# Import-time sanity check: every punycode vector must be a 2-tuple.
# Malformed entries are printed rather than raised.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349
Victor Stinnerf96418d2015-09-21 23:06:27 +02001350
class PunycodeTest(unittest.TestCase):
    """Run the punycode sample vectors through the "punycode" codec."""

    def test_encode(self):
        """Encoding each Unicode sample gives the expected punycode bytes."""
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while the codec emits
            # lower case only, and lower-casing just the expected value
            # is not enough because some input characters are upper case.
            produced = text.encode("punycode").decode("ascii")
            wanted = expected.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding each punycode sample gives back the Unicode text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001369
Victor Stinnerf96418d2015-09-21 23:06:27 +02001370
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# NameprepTest reports entry N (1-based) as "Test 3.N", so the position of
# every entry matters and skipped vectors keep a placeholder:
#   (None, None)    -- vector is skipped
#   (input, None)   -- nameprep(input) must raise UnicodeError
#   (input, bytes)  -- nameprep(input) must equal the decoded expected value
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1523
1524
class NameprepTest(unittest.TestCase):
    """Check encodings.idna.nameprep against the nameprep_tests vectors."""

    def test_nameprep(self):
        """Run every non-skipped vector, each in its own sub-test.

        Using subTest keeps the original assertion failure (with its
        traceback) intact, labels it with the vector number for both the
        "must raise" and the "must equal" cases, and lets the remaining
        vectors run after one of them fails.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % pos):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001543
Victor Stinnerf96418d2015-09-21 23:06:27 +02001544
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via the built-in, stream and incremental
    interfaces, with and without a trailing dot on the domain name."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII name without a trailing dot (this slot used to repeat
        # the trailing-dot assertion below verbatim).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is emitted on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without a trailing dot (this slot used to repeat
        # the trailing-dot assertion below verbatim).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is emitted on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                self.assertRaises(Exception,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(Exception,
                                  b"python.org".decode, "idna", errors)
1630
Victor Stinnerf96418d2015-09-21 23:06:27 +02001631
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # A single non-ASCII code point. The previous literal '\xffff' was
        # actually the three characters '\xff', 'f', 'f'.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\uffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE whatever the outcome of the test.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must list exactly these names, and each listed
        # name must exist on the module.
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        # A separate loop variable keeps the expected tuple intact, and the
        # assertion message names the missing attribute.
        for name in codecs.__all__:
            self.assertTrue(hasattr(codecs, name), name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec must fail for every input, including the
        # empty one, and for every error handler.
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                codecs.decode, b'abc', 'undefined', errors)
1739
Victor Stinnerf96418d2015-09-21 23:06:27 +02001740
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 bytes for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """The stream splits into two lines; the first keeps its newline."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001750
Victor Stinnerf96418d2015-09-21 23:06:27 +02001751
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8 and the wrapper hands
        # back UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes go in and the underlying file receives
        # Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001763
# Names of the text codecs that BasicUnicodeTest iterates over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are added only when this build of the codecs module
# exposes the corresponding encode function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode; BasicUnicodeTest
# skips its stream and incremental checks for them.
broken_unicode_with_stateful = [
    "punycode",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001881
Victor Stinnerf96418d2015-09-21 23:06:27 +02001882
Walter Dörwald3abcb012007-04-16 22:10:50 +00001883class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001884 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001885 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001886 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001887 name = codecs.lookup(encoding).name
1888 if encoding.endswith("_codec"):
1889 name += "_codec"
1890 elif encoding == "latin_1":
1891 name = "latin_1"
1892 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001893
Inada Naoki6a16b182019-03-18 15:44:11 +09001894 (b, size) = codecs.getencoder(encoding)(s)
1895 self.assertEqual(size, len(s), "encoding=%r" % encoding)
1896 (chars, size) = codecs.getdecoder(encoding)(b)
1897 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001898
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001899 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001900 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001901 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001902 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001903 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001904 for c in s:
1905 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001906 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001907 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001908 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001909 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001910 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001911 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001912 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001913 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001914 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001915 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001916
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001917 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001918 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001919 try:
1920 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001921 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001922 pass
1923 else:
1924 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001925 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001926 for c in s:
1927 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001928 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001929 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001930 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001931 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001932 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001933 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001934 self.assertEqual(decodedresult, s,
1935 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001936
1937 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001938 result = "".join(codecs.iterdecode(
1939 codecs.iterencode(s, encoding), encoding))
1940 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001941
1942 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001943 result = "".join(codecs.iterdecode(
1944 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001945 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001946
Victor Stinner554f3f02010-06-16 23:33:54 +00001947 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001948 # check incremental decoder/encoder with errors argument
1949 try:
1950 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001951 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001952 pass
1953 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001954 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001955 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001956 decodedresult = "".join(decoder.decode(bytes([c]))
1957 for c in encodedresult)
1958 self.assertEqual(decodedresult, s,
1959 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001960
@support.cpython_only
def test_basics_capi(self):
    """Round-trip ASCII text through incremental codecs fetched via the C API.

    For every encoding in ``all_unicode_encodings`` (except the ones listed
    in ``broken_unicode_with_stateful``) the incremental encoder/decoder
    pair returned by ``_testcapi`` must reproduce the input when fed one
    character / one byte at a time, both with the default error handler
    and with ``"ignore"``.
    """
    if _testcapi is None:
        # The module-level import falls back to None on ImportError.
        self.skipTest("requires the _testcapi extension module")

    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        with self.subTest(encoding=encoding):
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001998
def test_seek(self):
    """Check that ``seek(0, 0)`` lets a StreamReader re-read the whole text."""
    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        if encoding == "idna":  # FIXME: See SF bug #1163178
            continue
        if encoding in broken_unicode_with_stateful:
            continue
        with self.subTest(encoding=encoding):
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00002013
def test_bad_decode_args(self):
    """Decoders must raise TypeError when called with no or non-bytes input."""
    for encoding in all_unicode_encodings:
        with self.subTest(encoding=encoding):
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are excluded from the wrong-type check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)
2020
def test_bad_encode_args(self):
    """Encoders must raise TypeError when called without any argument."""
    for encoding in all_unicode_encodings:
        with self.subTest(encoding=encoding):
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002025
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002026 def test_encoding_map_type_initialized(self):
2027 from encodings import cp1140
2028 # This used to crash, we are only verifying there's no crash.
2029 table_type = type(cp1140.encoding_table)
2030 self.assertEqual(table_type, table_type)
2031
def test_decoder_state(self):
    """Run the getstate()/setstate() checks for every usable encoding."""
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        with self.subTest(encoding=encoding):
            encoded = u.encode(encoding)
            self.check_state_handling_decode(encoding, u, encoded)
            self.check_state_handling_encode(encoding, u, encoded)
2039
Victor Stinnerf96418d2015-09-21 23:06:27 +02002040
class CharmapTest(unittest.TestCase):
    r"""Tests for ``codecs.charmap_decode`` with str, int->str and int->int maps.

    Almost every case decodes the three bytes ``b"\x00\x01\x02"``.  The maps
    under test either cover all three bytes or leave byte 2 unmapped
    (missing entry, ``None``, ``"\ufffe"`` or ``0xFFFE``); the expected
    result then depends on the error handler.
    """

    _SAMPLE = b"\x00\x01\x02"

    def _check_decode(self, cases):
        """Assert each ``(errors, mapping, expected_text)`` case.

        The whole sample must be consumed, so the expected length is always
        ``len(_SAMPLE)``.
        """
        for errors, mapping, expected in cases:
            with self.subTest(errors=errors, mapping=mapping):
                self.assertEqual(
                    codecs.charmap_decode(self._SAMPLE, errors, mapping),
                    (expected, len(self._SAMPLE)),
                )

    def _check_strict_raises(self, exc_type, mappings):
        """Assert strict decoding of the sample raises *exc_type* per map."""
        for mapping in mappings:
            with self.subTest(mapping=mapping):
                self.assertRaises(
                    exc_type,
                    codecs.charmap_decode, self._SAMPLE, "strict", mapping,
                )

    def test_decode_with_string_map(self):
        self._check_decode([
            ("strict", "abc", "abc"),
            ("strict", "\U0010FFFFbc", "\U0010FFFFbc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("backslashreplace", "ab", "ab\\x02"),
            ("backslashreplace", "ab\ufffe", "ab\\x02"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._check_strict_raises(UnicodeDecodeError, ["ab", "ab\ufffe"])

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self._check_decode([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
        ])

        # Byte 2 is unmapped in each of these: missing key, None, and
        # (Issue #14850) the '\ufffe' marker.
        unmapped = [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ]
        self._check_strict_raises(UnicodeDecodeError, unmapped)
        for errors, expected in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            self._check_decode(
                [(errors, mapping, expected) for mapping in unmapped])

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self._check_decode([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             chr(sys.maxunicode) + "bc"),
        ])

        # A code point above sys.maxunicode is a TypeError, not a decode error.
        self._check_strict_raises(
            TypeError, [{0: sys.maxunicode + 1, 1: b, 2: c}])

        # Byte 2 is unmapped: missing key, or the 0xFFFE marker.
        unmapped = [
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ]
        self._check_strict_raises(UnicodeDecodeError, unmapped)
        for errors, expected in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            self._check_decode(
                [(errors, mapping, expected) for mapping in unmapped])
2275
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002276
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the block must close the underlying stream.
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same contract as EncodedFile: the wrapped stream is closed on exit.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002290
Victor Stinnerf96418d2015-09-21 23:06:27 +02002291
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.subTest(decoder=decoder.__name__):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
                         (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002331
Serhiy Storchakad6793772013-01-29 10:20:44 +02002332
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the ``unicode_escape`` encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Characters in range(32, 127) other than the backslash are expected
        # to encode to the single byte with the same value.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash is expected to decode to the
        # character with the same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised escapes are kept verbatim but must emit a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of too-short digit counts to try)
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2419
2420
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the ``raw_unicode_escape`` encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below 256 is expected to encode to the single byte
        # with the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but 'u'/'U' passes through as is.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of too-short digit counts to try)
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2469
2470
class EscapeEncodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_encode``."""

    def test_escape_encode(self):
        # (input bytes, expected (escaped bytes, consumed length))
        tests = [
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        ]
        for data, output in tests:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)
        # Only bytes objects are accepted: str and bytearray are rejected.
        self.assertRaises(TypeError, codecs.escape_encode, 'spam')
        self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2489
2490
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the ``surrogateescape`` error handler.

    Each undecodable byte ``0xNN`` is expected to decode to the lone
    surrogate ``U+DCNN`` and to encode back to the original byte.
    """

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2523
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002524
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with ``codecs.open`` around seeks."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a position
                # different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a seek()
                # at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002580
Victor Stinner3fed0872010-05-22 02:16:27 +00002581
# Bytes-to-bytes codecs exercised by the transform codec tests.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Alternative names for each transform codec, keyed by the codec's own name.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# The compression codecs are only registered for testing when the module
# backing them can be imported.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002611
Victor Stinnerf96418d2015-09-21 23:06:27 +02002612
Georg Brandl02524622010-12-02 18:06:51 +00002613class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002614
def test_basics(self):
    """Round-trip all 256 byte values through each bytes transform codec."""
    binput = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o))
            self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002625
def test_read(self):
    """A StreamReader for each transform codec must decode via ``read()``."""
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002633
def test_readline(self):
    """A StreamReader for each transform codec must decode via ``readline()``."""
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002641
def test_buffer_api_usage(self):
    """Transform codecs must give the same result for bytes and memoryview."""
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            data = original
            view = memoryview(data)
            data = codecs.encode(data, encoding)
            view_encoded = codecs.encode(view, encoding)
            self.assertEqual(view_encoded, data)
            # From here on ``data`` holds the encoded form until decoded again.
            view = memoryview(data)
            data = codecs.decode(data, encoding)
            self.assertEqual(data, original)
            view_decoded = codecs.decode(view, encoding)
            self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002659
def test_text_to_binary_blacklists_binary_transforms(self):
    # str.encode() with a binary -> binary codec must fail with a
    # helpful LookupError that is not chained to another exception
    template = (r"{!r} is not a text encoding; "
                r"use codecs.encode\(\) to handle arbitrary codecs")
    text = "bad input type"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            expected = template.format(encoding)
            with self.assertRaisesRegex(LookupError, expected) as caught:
                text.encode(encoding)
            self.assertIsNone(caught.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002671
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002672 def test_text_to_binary_blacklists_text_transforms(self):
2673 # Check str.encode gives a good error message for str -> str codecs
2674 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002675 r"use codecs.encode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002676 with self.assertRaisesRegex(LookupError, msg):
2677 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002678
def test_binary_to_text_blacklists_binary_transforms(self):
    # bytes.decode() and bytearray.decode() with a binary -> binary
    # codec must fail with a helpful LookupError
    template = (r"{!r} is not a text encoding; "
                r"use codecs.decode\(\) to handle arbitrary codecs")
    plain = b"encode first to ensure we meet any format restrictions"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(plain, encoding)
            expected = template.format(encoding)
            # bytes(encoded) is the immutable case, bytearray(encoded)
            # the mutable one
            for as_buffer in (bytes, bytearray):
                with self.assertRaisesRegex(LookupError, expected):
                    as_buffer(encoded).decode(encoding)
2693
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002694 def test_binary_to_text_blacklists_text_transforms(self):
2695 # Check str -> str codec gives a good error for binary input
2696 for bad_input in (b"immutable", bytearray(b"mutable")):
2697 with self.subTest(bad_input=bad_input):
2698 msg = (r"^'rot_13' is not a text encoding; "
R David Murray44b548d2016-09-08 13:59:53 -04002699 r"use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002700 with self.assertRaisesRegex(LookupError, msg) as failure:
2701 bad_input.decode("rot_13")
2702 self.assertIsNone(failure.exception.__cause__)
2703
Zachary Wareefa2e042013-12-30 14:54:11 -06002704 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002705 def test_custom_zlib_error_is_wrapped(self):
2706 # Check zlib codec gives a good error for malformed input
2707 msg = "^decoding with 'zlib_codec' codec failed"
2708 with self.assertRaisesRegex(Exception, msg) as failure:
2709 codecs.decode(b"hello", "zlib_codec")
2710 self.assertIsInstance(failure.exception.__cause__,
2711 type(failure.exception))
2712
2713 def test_custom_hex_error_is_wrapped(self):
2714 # Check hex codec gives a good error for malformed input
2715 msg = "^decoding with 'hex_codec' codec failed"
2716 with self.assertRaisesRegex(Exception, msg) as failure:
2717 codecs.decode(b"hello", "hex_codec")
2718 self.assertIsInstance(failure.exception.__cause__,
2719 type(failure.exception))
2720
# Unfortunately, the bz2 module raises OSError, which the codec
# machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002723
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Each alias must resolve to the same codec name as the codec it
    # stands for.
    for codec_name, aliases in transform_aliases.items():
        canonical = codecs.lookup(codec_name).name
        for alias in aliases:
            with self.subTest(alias=alias):
                self.assertEqual(codecs.lookup(alias).name, canonical)
2732
Martin Panter06171bd2015-09-12 00:34:28 +00002733 def test_quopri_stateless(self):
2734 # Should encode with quotetabs=True
2735 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2736 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2737 # But should still support unescaped tabs and spaces
2738 unescaped = b"space tab eol\n"
2739 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2740
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002741 def test_uu_invalid(self):
2742 # Missing "begin" line
2743 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2744
Nick Coghlan8b097b42013-11-13 23:49:21 +10002745
# The codec system tries to wrap exceptions so that the error mentions
# the operation being performed and the codec involved.  We currently
# *only* want that for relatively stateless exceptions, i.e. ones whose
# only significant information is their type and a single str argument.

# A module-level registry backs one shared search function; this avoids
# appearing to leak objects when registering multiple search functions.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Codec search function: the CodecInfo stored under codec_name, or
    # None when nothing is stored under that name.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Fallback when _codecs does not provide _forget_codec: no-op
        return None
2766
2767
class ExceptionChainingTest(unittest.TestCase):
    # Each test installs a throwaway codec under a per-instance name and
    # checks whether an exception escaping from it reaches the caller
    # wrapped (operation and codec name in the message, original exception
    # as __cause__) or untouched.

    def setUp(self):
        # A codec search function can't be unregistered, so the shared
        # one is rendered harmless after the test case finishes by using
        # the test case repr as the codec name.  The codecs module
        # normalizes codec names (this doesn't appear to be formally
        # documented), hence the explicit normalization here.
        # id(self) is appended to get a truly unique name and so avoid
        # issues with the codec cache when these tests are run multiple
        # times (e.g. when hunting for refleaks).
        raw_name = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # The object to raise lives on the instance because codec caching
        # (the codec entry can't be recreated) interacts badly with
        # regrtest refleak hunting (the same test instance is run several
        # times).  The codec therefore calls back into the instance to
        # find out what to raise instead of closing over an object that
        # may change on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        try:
            _forget_codec(name)
        except KeyError:
            pass

    def set_codec(self, encode, decode):
        # Publish the given callables as this test's codec
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # The body must raise exc_type with the wrapper message, chained
        # to a cause of the same type that still carries a traceback
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # All four entry points must report the failure wrapped
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        for operation, func, args in (
            ("encoding", "str_input".encode, (name,)),
            ("encoding", codecs.encode, ("str_input", name)),
            ("decoding", b"bytes input".decode, (name,)),
            ("decoding", codecs.decode, (b"bytes input", name)),
        ):
            with self.assertWrapped(operation, exc_type, msg):
                func(*args)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # All four entry points must let the RuntimeError through with
        # its own message
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        for func, args in (
            ("str input".encode, (name,)),
            (codecs.encode, ("str input", name)),
            (b"bytes input".decode, (name,)),
            (codecs.decode, (b"bytes input", name)),
        ):
            with self.assertRaisesRegex(RuntimeError, msg):
                func(*args)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        for func, args in (
            ("str input".encode, (name,)),
            (codecs.encode, ("str input", name)),
            (b"bytes input".decode, (name,)),
            (codecs.decode, (b"bytes input", name)),
        ):
            with self.assertRaisesRegex(LookupError, pattern):
                func(*args)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods.
        # Third party codecs won't be flagged, however, so the case where
        # an inappropriate output type is produced still has to be
        # handled appropriately.
        self.set_codec(lambda *args, **kwds: ("not bytes!", 0),
                       lambda *args, **kwds: (b"not str!", 0))
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_pattern = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$"
        ).format(name)
        with self.assertRaisesRegex(TypeError, encode_pattern):
            "str_input".encode(name)
        decode_pattern = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$"
        ).format(name)
        with self.assertRaisesRegex(TypeError, decode_pattern):
            b"bytes input".decode(name)
2927
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002928
Georg Brandl02524622010-12-02 18:06:51 +00002929
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode();
    # the decorator skips the whole class everywhere but on Windows.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 must be rejected with ValueError, code page 123 with OSError
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The Unicode error text must mention the code page by name
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests is an iterable of (raw, errors, expected) triples; an
        # expected value of None means decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        # tests is an iterable of (text, errors, expected) triples; an
        # expected value of None means encoding must raise
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_incremental(self):
        # The last argument is False here (True in the other tests of
        # this class); the expected results below show incomplete
        # trailing input being left unconsumed instead of raising.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # bigmemtest passes the size to use as an extra positional
        # argument, so the method has to accept it; the input length is
        # derived from that value instead of repeating the constant.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Drop the input before the remaining checks
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3112
Victor Stinner3a50e702011-10-18 21:21:00 +02003113
class ASCIITest(unittest.TestCase):
    # Behaviour of the 'ascii' codec, including its error handlers.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3151
3152
class Latin1Test(unittest.TestCase):
    # Behaviour of the 'latin1' codec, including its error handlers.

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3188
3189
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    ENCODING = sys.getfilesystemencoding()
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        # Thin wrapper over the _testcapi helper under test
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        # str.encode() with ENCODING is the reference: the helper must
        # either return the same bytes or, where the reference raises
        # UnicodeEncodeError, fail with a RuntimeError describing the
        # encode error.
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as caught:
                        self.encode(text, errors)
                    self.assertRegex(str(caught.exception),
                                     r"encode error: pos=[0-9]+, reason=")
                else:
                    self.assertEqual(self.encode(text, errors), expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        # Probe with an empty string first: skip when the encoder
        # reports the handler as unsupported, re-raise anything else
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                          "surrogatepass error handler")

        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        # Thin wrapper over the _testcapi helper under test
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        is_utf8 = (self.ENCODING == "utf-8")
        encode_errors = 'surrogateescape' if is_utf8 else 'strict'

        # Inputs to decode: the fixed byte strings plus every encodable
        # entry of STRINGS (and, for UTF-8, its surrogatepass encoding
        # when that differs)
        candidates = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
            except UnicodeEncodeError:
                encoded = None
            else:
                if encoded not in candidates:
                    candidates.append(encoded)

            if is_utf8:
                passed = text.encode(self.ENCODING, 'surrogatepass')
                if passed != encoded:
                    candidates.append(passed)

        # bytes.decode() with ENCODING is the reference, as for encoding
        for encoded in candidates:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as caught:
                        self.decode(encoded, errors)
                    errmsg = str(caught.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    self.assertEqual(self.decode(encoded, errors), expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        # Probe with empty input first: skip when the decoder reports
        # the handler as unsupported, re-raise anything else
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) != 'unsupported error handler':
                raise
            self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                          "surrogatepass error handler")

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        with self.assertRaises(ValueError) as caught:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(caught.exception), 'unsupported error handler')
3302
Victor Stinner3d4226a2018-08-29 22:21:32 +02003303
# Run the tests through unittest's command-line entry point when this
# file is executed as a script.
if __name__ == "__main__":
    unittest.main()