blob: 86d0dde1705763dca23cef25574c46290356732f [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
Antoine Pitrou00b2c862011-10-05 13:01:41 +020012try:
13 import ctypes
14except ImportError:
15 ctypes = None
16 SIZEOF_WCHAR_T = -1
17else:
18 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000019
def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and the codec *coder*.

    The returned ``check(input, expect)`` callable asserts (through
    ``self.assertEqual``) that ``coder(input)`` yields the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        outcome = coder(input)
        self.assertEqual(outcome, (expect, len(input)))
    return check
24
Victor Stinnerf96418d2015-09-21 23:06:27 +020025
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        pending = self._buffer
        if size < 0:
            # Hand out everything; the empty slice keeps the buffer's type.
            self._buffer = pending[:0]
            return pending
        # Hand out at most `size` items and keep the remainder queued.
        self._buffer = pending[size:]
        return pending[:size]
45
Victor Stinnerf96418d2015-09-21 23:06:27 +020046
class MixInCheckStateHandling:
    """Helpers that exercise getstate()/setstate() of incremental codecs.

    The helpers call ``assert*`` methods on ``self``, so the mixin has to be
    combined with a ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted and resumed anywhere.

        For every split position ``i`` the first ``i`` items of *s* are
        decoded, the decoder state is copied into a brand new decoder and
        the rest is decoded with ``final=True``.  Both parts together must
        equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input must not produce any output
                # (assertFalse reports the unexpected output on failure)
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted and resumed anywhere.

        For every split position ``i`` the first ``i`` characters of *u* are
        encoded, the encoder state is copied into a brand new encoder and
        the rest is encoded with ``final=True``.  Both parts together must
        equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # Continue in a fresh encoder that only knows the saved state
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
79
Victor Stinnerf96418d2015-09-21 23:06:27 +020080
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the codec test classes.

    The methods read ``self.encoding`` and ``self.ill_formed_sequence``;
    neither is defined here, so the classes using this mixin have to
    provide them.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare the output.

        After each byte the text decoded so far must equal the matching
        entry of *partialresults*.  The check is done with a StreamReader,
        with an incremental decoder (fresh and again after ``reset()``)
        and finally with ``codecs.iterdecode()``.
        """
        encoded = input.encode(self.encoding)
        chunks = [bytes([value]) for value in encoded]

        # get a StreamReader for the encoding and feed it byte by byte,
        # reading everything that is available after each byte
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for chunk, expected in zip(chunks, partialresults):
            queue.write(chunk)
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # do the check again with an incremental decoder: first freshly
        # created, then once more to see whether reset() works properly
        decoder = codecs.getincrementaldecoder(self.encoding)()
        for needs_reset in (False, True):
            if needs_reset:
                decoder.reset()
            decoded = ""
            for chunk, expected in zip(chunks, partialresults):
                decoded += decoder.decode(chunk)
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(text):
            stream = io.BytesIO(text.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(text, keepends=True, size=None):
            reader = getreader(text)
            # readline() is called until it returns an empty string
            return "|".join(iter(
                lambda: reader.readline(size=size, keepends=keepends), ""))

        text = "foo\nbar\r\nbaz\rspam\u2028eggs"
        with_ends = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        without_ends = "foo|bar|baz|spam|eggs"
        for size in (None, 10):
            for keepends, expected in ((True, with_ends),
                                       (False, without_ends)):
                self.assertEqual(readalllines(text, keepends, size), expected)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        bodies = [(i*200+200)*"\u3042" for i, _ in enumerate(lineends)]
        terminated = [body + lineend
                      for body, lineend in zip(bodies, lineends)]
        self.assertEqual(readalllines("".join(terminated), True),
                         "|".join(terminated))
        self.assertEqual(readalllines("".join(terminated), False),
                         "|".join(bodies))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                text = 10*(size*"a" + lineend + "xxx\n")
                for keepends in (True, False):
                    first = size*"a" + (lineend if keepends else "")
                    second = "xxx\n" if keepends else "xxx"
                    reader = getreader(text)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        first, rest = lines[0], lines[1:]

        def open_reader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        reader = open_reader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.read(), ''.join(rest))
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        reader = open_reader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.read(1), lines[1][0])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[len(first) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        reader = open_reader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.readlines(), rest)
        self.assertEqual(reader.read(), '')

        # Test read(n) followed by read()
        reader = open_reader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        reader = open_reader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(1), data[5])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        reader = open_reader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(), [first[5:]] + rest)
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the
        # \r\n terminated lines that were encoded into the stream.
        expected_lines = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        payload = "".join(expected_lines).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        for index, line in enumerate(reader):
            self.assertEqual(line, expected_lines[index])

    def test_readlinequeue(self):
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        def expect_line(expected, keepends):
            self.assertEqual(reader.readline(keepends=keepends), expected)

        # No lineends
        writer.write("foo\r")
        expect_line("foo", False)
        writer.write("\nbar\r")
        expect_line("", False)
        expect_line("bar", False)
        writer.write("baz")
        expect_line("baz", False)
        expect_line("", False)

        # Lineends
        writer.write("foo\r")
        expect_line("foo\r", True)
        writer.write("\nbar\r")
        expect_line("\n", True)
        expect_line("bar\r", True)
        writer.write("baz")
        expect_line("baz", True)
        expect_line("", True)
        writer.write("foo\r\n")
        expect_line("foo\r\n", True)

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        payload = (s1+s2+s3).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        # every line comes back unchanged, followed by "" at the end
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        payload = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        # every line comes back unchanged, followed by "" at the end
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)

    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        encoding = self.encoding
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, encoding)
        # (text to encode, error handler, text with the same encoded form)
        encode_cases = [
            ("[\uDC80]", "backslashreplace", "[\\udc80]"),
            ("[\uDC80]", "namereplace", "[\\udc80]"),
            ("[\uDC80]", "xmlcharrefreplace", "[&#56448;]"),
            ("[\uDC80]", "ignore", "[]"),
            ("[\uDC80]", "replace", "[?]"),
            # sequential surrogate characters
            ("[\uD800\uDC80]", "ignore", "[]"),
            ("[\uD800\uDC80]", "replace", "[??]"),
        ]
        for text, handler, equivalent in encode_cases:
            self.assertEqual(text.encode(encoding, handler),
                             equivalent.encode(encoding))

        # Encoding the empty string leaves only the BOM (if the codec
        # writes one); it is stripped from the encoded neighbours below.
        bom = "".encode(encoding)
        skip = len(bom)
        ill_formed = self.ill_formed_sequence
        backslashreplace = ''.join('\\x%02x' % b for b in ill_formed)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(encoding)[skip:]
            after_sequence = after.encode(encoding)[skip:]
            test_string = before + "\uDC80" + after
            test_sequence = bom + before_sequence + ill_formed + after_sequence
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              encoding)
            self.assertEqual(test_string.encode(encoding, "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(encoding, "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            self.assertEqual(test_sequence.decode(encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200403
Victor Stinnerf96418d2015-09-21 23:06:27 +0200404
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (BOM-prefixed output, see spamle/spambe)."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in native byte order
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" with a single leading BOM, little and big endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        stream_writer = writer(raw)
        stream_writer.write("spam")
        stream_writer.write("spam")
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(encoded, (self.spamle, self.spambe))
        # try to read it back
        stream_reader = reader(io.BytesIO(encoded))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM
        for garbage in (4*b"\xff", 8*b"\xff"):
            stream_reader = codecs.getreader(self.encoding)(io.BytesIO(garbage))
            self.assertRaises(UnicodeError, stream_reader.read)

    def test_partial(self):
        # Entry k is the text expected after k+1 bytes have been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
            ]
            + [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
499
Victor Stinnerf96418d2015-09-21 23:06:27 +0200500
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # Entry k is the text expected after k+1 bytes have been fed.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        decoded = codecs.utf_32_le_decode(b'\x00\x00\x01\x00' * count)[0]
        self.assertEqual('\U00010000' * count, decoded)
545
Victor Stinnerf96418d2015-09-21 23:06:27 +0200546
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # Entry k is the text expected after k+1 bytes have been fed.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        decoded = codecs.utf_32_be_decode(b'\x00\x01\x00\x00' * count)[0]
        self.assertEqual('\U00010000' * count, decoded)
591
592
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (BOM-prefixed output, see spamle/spambe)."""

    encoding = "utf-16"
    # Encoded form of the lone surrogate U+DC80 in native byte order
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" with a single leading BOM, little and big endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        stream_writer = writer(raw)
        stream_writer.write("spam")
        stream_writer.write("spam")
        encoded = raw.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(encoded, (self.spamle, self.spambe))
        # try to read it back
        stream_reader = reader(io.BytesIO(encoded))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM
        for garbage in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream_reader = codecs.getreader(self.encoding)(io.BytesIO(garbage))
            self.assertRaises(UnicodeError, stream_reader.read)

    def test_partial(self):
        # Entry k is the text expected after k+1 bytes have been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
            ]
            + [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000678
class UTF16LETest(ReadTest, unittest.TestCase):
    # Codec-specific data read by the shared tests inherited from ReadTest.
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # Expected results after each successive byte of the encoded input;
        # there is no BOM, so the second byte already completes "\x00".
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining vectors running (and names the
            # offending input) when one of them fails.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
722
class UTF16BETest(ReadTest, unittest.TestCase):
    # Codec-specific data read by the shared tests inherited from ReadTest.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # Expected results after each successive byte of the encoded input;
        # there is no BOM, so the second byte already completes "\x00".
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining vectors running (and names the
            # offending input) when one of them fails.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
766
class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of every encoded result; empty for plain
    # UTF-8 and overridden by subclasses whose encoder emits a signature.
    BOM = b''

    def test_partial(self):
        # Expected results after each successive byte of the encoded input:
        # a character only appears once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text covering the boundaries of every UTF-8 sequence length.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Two invalid bytes between brackets, decoded with each lenient
        # error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # Surrogates that surrogateescape cannot map back to a byte must be
        # reported together as the failing slice.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # Encoding: lone surrogates are written as three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        # Decoding: the same byte sequences are accepted again.
        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted surrogate sequences are still errors.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000842
Victor Stinnerf96418d2015-09-21 23:06:27 +0200843
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means that encoding
        # must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # subTest lets the remaining rows run when one row fails.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # (bytes, error handler, expected text); None means that decoding
        # must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # subTest lets the remaining rows run when one row fails.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Each lenient handler has its own replacement for the surrogate.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # surrogatepass round-trips lone surrogates as three-byte sequences.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
931
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200932
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Expected results after each successive byte of the encoded input;
        # characters inside a shifted (base64) run only appear once enough
        # bytes have been fed to complete them.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with errors='replace')
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode alike;
        # decoding accepts the shifted run with or without the closing '-'.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected result with errors='replace')
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001065
1066
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte is truncated UTF-16 and must be rejected when the
        # decoder is told the input is final.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
1074
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # Any object exporting the buffer interface is accepted, not only
        # bytes; the result is the raw bytes and the number consumed.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument, and an argument that is not a buffer.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1090
class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"
    # The utf-8-sig encoder prefixes its output with the UTF-8 signature.
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Expected results after each successive byte of the encoded input.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder is stripped by the
        # incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a range
        # of read sizes (None means a plain read()) and check that the
        # concatenated chunks equal *unistring* every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty result signals that the stream is exhausted.
                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the signature: it must not appear in the
        # decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without the signature decodes to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
1175
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash is copied through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            b = bytes([code])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is a line continuation and produces nothing.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes: one to three digits.
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hex escapes: exactly two digits.
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognised escapes are kept verbatim but warn.  Lower-case
        # letters that are valid escapes are skipped; every upper-case
        # letter is checked.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Incomplete \x escapes: strict raises, 'ignore' drops them and
        # 'replace' substitutes '?'; the whole input counts as consumed.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001235
Victor Stinnerf96418d2015-09-21 23:06:27 +02001236
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = io.BytesIO()
        # The context manager closes the recoder (and with it the underlying
        # stream) even if the write raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        self.assertTrue(f.closed)
1247
Martin v. Löwis2548c732003-04-18 10:39:54 +00001248# From RFC 3492
1249punycode_testcases = [
1250 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001251 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1252 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001253 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001255 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001256 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001258 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001259 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001260 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001261 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1262 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1263 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001264 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001266 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1267 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1268 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001269 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001270 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001271 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001272 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1273 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1274 "\u0939\u0948\u0902",
1275 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001276
1277 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001278 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001279 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1280 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001281
1282 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001283 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1284 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1285 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001286 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1287 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001288
1289 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001290 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1291 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1292 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1293 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001294 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001295
1296 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001297 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1298 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1299 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1300 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1301 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001302 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303
1304 # (K) Vietnamese:
1305 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1306 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001307 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1308 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1309 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1310 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001311 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001315 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001316
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1319 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1320 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001321 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001322
1323 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001324 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1325 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1326 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001327 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328
1329 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001330 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001331 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332
1333 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001334 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1335 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001336 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001337
1338 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001339 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001340 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341
1342 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001343 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001344 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001345
1346 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001347 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1348 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001349 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001350 ]
1351
# Import-time sanity check on the punycode table: every entry is expected
# to be a two-item tuple, so echo any entry of a different length.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001355
Victor Stinnerf96418d2015-09-21 23:06:27 +02001356
class PunycodeTest(unittest.TestCase):
    """Run the ``punycode_testcases`` pairs through the punycode codec."""

    def test_encode(self):
        """Encoding each text gives the expected Punycode, ignoring case."""
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our code produces
            # only lower case, and lower-casing just the expected value is
            # not enough because some input characters are upper case too.
            produced = str(text.encode("punycode"), "ascii")
            wanted = str(expected, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding each Punycode value gives back the original text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001375
Victor Stinnerf96418d2015-09-21 23:06:27 +02001376
class UnicodeInternalTest(unittest.TestCase):
    """Decoding and encoding checks for the ``unicode_internal`` codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        accepted = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are reversed on little-endian hosts
        # before being handed to the decoder.
        for raw, expected in accepted:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings():
                decoded = raw.decode("unicode_internal")
                self.assertEqual(expected, decoded)
        for raw in rejected:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode("unicode_internal")

        if little_endian:
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            escaped = invalid.decode("unicode_internal", "backslashreplace")
            self.assertEqual(escaped, invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError carries encoding, object and span."""
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises is the outer context so the decode error travels
        # through check_warnings exactly as it would without it.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                payload.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(payload, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is applied by the decoder function."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Four 0x22 bytes are spliced between the two encoded
            # characters; the registered handler is codecs.ignore_errors.
            spliced = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decoder(bytes(spliced, "ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """The encoder reports the number of characters consumed."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001457
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks an input containing prohibited
# characters; an input of None marks a vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz'
     b'\xef\xb8\x80\xef\xb8\x88\xef\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c'
     b'\xc2\xa0\xc2\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f'
     b'\xe3\x8c\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3\x83\x88\xe3\x83\xab'
     b'i\xcc\x87tel\x28d\x29'
     b'\xe3\x82\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1610
1611
class NameprepTest(unittest.TestCase):
    """Check ``encodings.idna.nameprep`` against the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        """Run every vector as a sub-test labelled with its "3.N" number.

        Each vector runs inside ``subTest`` so that a mismatch is reported
        as an ordinary test failure tagged with the vector number, and the
        remaining vectors still run.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered from 3.1, hence pos + 1.
            with self.subTest(case="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001630
Victor Stinnerf96418d2015-09-21 23:06:27 +02001631
class IDNACodecTest(unittest.TestCase):
    """Exercise the ``idna`` codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """``str(bytes, "idna")`` decodes plain and ACE-prefixed names."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """``str.encode("idna")`` encodes plain and non-ASCII names."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading again after the stream is exhausted returns ``""``."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time and chunked decoding give the one-shot results."""
        # Plain and ACE-prefixed names, each with and without the trailing
        # dot, fed to the decoder one byte at a time.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                single_bytes = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(single_bytes, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The last label is only produced once final=True is passed.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        # With a trailing dot the last label is produced immediately.
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time and chunked encoding give the one-shot results."""
        # codecs.iterencode() iterates over the str, so the encoder
        # receives one character per call.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The last label is only produced once final=True is passed.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                # Expect UnicodeError specifically, so that an unrelated
                # exception type cannot satisfy the check.
                self.assertRaises(UnicodeError,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(UnicodeError,
                                  b"python.org".decode, "idna", errors)
1717
Victor Stinnerf96418d2015-09-21 23:06:27 +02001718
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001719class CodecsModuleTest(unittest.TestCase):
1720
1721 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001722 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1723 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001724 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001725 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001726 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001727
Victor Stinnera57dfd02014-05-14 17:13:14 +02001728 # test keywords
1729 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1730 '\xe4\xf6\xfc')
1731 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1732 '[]')
1733
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001734 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001735 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1736 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001737 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001738 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001739 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001740 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001741
Victor Stinnera57dfd02014-05-14 17:13:14 +02001742 # test keywords
1743 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1744 b'\xe4\xf6\xfc')
1745 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1746 b'[]')
1747
Walter Dörwald063e1e82004-10-28 13:04:26 +00001748 def test_register(self):
1749 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001750 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001751
1752 def test_lookup(self):
1753 self.assertRaises(TypeError, codecs.lookup)
1754 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001755 self.assertRaises(LookupError, codecs.lookup, " ")
1756
1757 def test_getencoder(self):
1758 self.assertRaises(TypeError, codecs.getencoder)
1759 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1760
1761 def test_getdecoder(self):
1762 self.assertRaises(TypeError, codecs.getdecoder)
1763 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1764
1765 def test_getreader(self):
1766 self.assertRaises(TypeError, codecs.getreader)
1767 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1768
1769 def test_getwriter(self):
1770 self.assertRaises(TypeError, codecs.getwriter)
1771 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001772
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001773 def test_lookup_issue1813(self):
1774 # Issue #1813: under Turkish locales, lookup of some codecs failed
1775 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001776 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001777 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1778 try:
1779 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1780 except locale.Error:
1781 # Unsupported locale on this system
1782 self.skipTest('test needs Turkish locale')
1783 c = codecs.lookup('ASCII')
1784 self.assertEqual(c.name, 'ascii')
1785
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001786 def test_all(self):
1787 api = (
1788 "encode", "decode",
1789 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1790 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1791 "getencoder", "getdecoder", "getincrementalencoder",
1792 "getincrementaldecoder", "getreader", "getwriter",
1793 "register_error", "lookup_error",
1794 "strict_errors", "replace_errors", "ignore_errors",
1795 "xmlcharrefreplace_errors", "backslashreplace_errors",
1796 "namereplace_errors",
1797 "open", "EncodedFile",
1798 "iterencode", "iterdecode",
1799 "BOM", "BOM_BE", "BOM_LE",
1800 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1801 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1802 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1803 "StreamReaderWriter", "StreamRecoder",
1804 )
1805 self.assertCountEqual(api, codecs.__all__)
1806 for api in codecs.__all__:
1807 getattr(codecs, api)
1808
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001809 def test_open(self):
1810 self.addCleanup(support.unlink, support.TESTFN)
1811 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1812 with self.subTest(mode), \
1813 codecs.open(support.TESTFN, mode, 'ascii') as file:
1814 self.assertIsInstance(file, codecs.StreamReaderWriter)
1815
1816 def test_undefined(self):
1817 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1818 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1819 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1820 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1821 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1822 self.assertRaises(UnicodeError,
1823 codecs.encode, 'abc', 'undefined', errors)
1824 self.assertRaises(UnicodeError,
1825 codecs.decode, b'abc', 'undefined', errors)
1826
Victor Stinnerf96418d2015-09-21 23:06:27 +02001827
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 ``StreamReader``."""

    def setUp(self):
        # Two multi-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text, keeping the line ending."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001837
Victor Stinnerf96418d2015-09-21 23:06:27 +02001838
class EncodedFileTest(unittest.TestCase):
    """Transcoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        """Reads and writes are recoded between data and file encodings."""
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        latin1_sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001850
# Names of the text encodings listed for testing, grouped by family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8",
]

# "mbcs" and "oem" are added only when codecs exposes the matching
# "<name>_encode" function.
all_unicode_encodings.extend(
    name for name in ("mbcs", "oem") if hasattr(codecs, name + "_encode")
)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001960
# Deliberately not tested, because it is not supposed to work:
#    "undefined"

# Encodings that don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001970
Victor Stinnerf96418d2015-09-21 23:06:27 +02001971
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings.

    Per-encoding work is wrapped in subTest() so that a failure names the
    encoding that caused it and the remaining encodings are still checked.
    """

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                self._check_basics(encoding, s)

    def _check_basics(self, encoding, s):
        """Run every test_basics check for a single encoding.

        Checks the registered codec name and the stateless encoder/decoder;
        for codecs not listed in broken_unicode_with_stateful it also checks
        the stream reader/writer, the incremental encoder/decoder and
        iterencode()/iterdecode().
        """
        msg = "encoding=%r" % encoding

        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"),
                         name.replace("_", "-"))

        with support.check_warnings():
            # unicode-internal has been deprecated
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), msg)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, msg)

        if encoding in broken_unicode_with_stateful:
            return

        # check stream reader/writer, feeding one character/byte at a time
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            self.assertIs(type(chunk), bytes, msg)
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s, msg)

        # check incremental decoder/encoder and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check incremental decoder/encoder
            encodedresult = b"".join(encoder.encode(c) for c in s)
            encodedresult += encoder.encode("", True)
            decoder = codecs.getincrementaldecoder(encoding)()
            decodedresult = "".join(decoder.decode(bytes([c]))
                                    for c in encodedresult)
            decodedresult += decoder.decode(b"", True)
            self.assertEqual(decodedresult, s, msg)

            # check iterencode()/iterdecode()
            result = "".join(codecs.iterdecode(
                codecs.iterencode(s, encoding), encoding))
            self.assertEqual(result, s, msg)

            # check iterencode()/iterdecode() with empty string
            result = "".join(codecs.iterdecode(
                codecs.iterencode("", encoding), encoding))
            self.assertEqual(result, "", msg)

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(encoder.encode(c) for c in s)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decodedresult = "".join(decoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                msg = "encoding=%r" % encoding

                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c)
                                                 for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for _ in range(5):
                    # Test that calling seek resets the internal codec state
                    # and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                self.check_state_handling_decode(encoding, u,
                                                 u.encode(encoding))
                self.check_state_handling_encode(encoding, u,
                                                 u.encode(encoding))
2133
Victor Stinnerf96418d2015-09-21 23:06:27 +02002134
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with str, int->str and int->int
    mappings."""

    # Input decoded by every check except the "all bytes" one.
    _RAW = b"\x00\x01\x02"

    # (error handler, expected text) used when byte 0x02 has no usable
    # mapping.
    _FALLBACKS = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _assert_decodes(self, mapping, expected, errors="strict"):
        """Assert that _RAW decodes to *expected*, consuming 3 bytes."""
        self.assertEqual(
            codecs.charmap_decode(self._RAW, errors, mapping),
            (expected, 3)
        )

    def _assert_strict_fails(self, mapping, exc=UnicodeDecodeError):
        """Assert that strict decoding of _RAW with *mapping* raises *exc*."""
        self.assertRaises(exc,
            codecs.charmap_decode, self._RAW, "strict", mapping
        )

    def _check_unmapped_byte(self, mappings):
        """Check each mapping in *mappings* with every error handler.

        Strict decoding must raise UnicodeDecodeError; the handlers listed
        in _FALLBACKS must produce their expected text.
        """
        for mapping in mappings:
            self._assert_strict_fails(mapping)
        for errors, expected in self._FALLBACKS:
            for mapping in mappings:
                self._assert_decodes(mapping, expected, errors)

    def _check_ignore_all_bytes(self, empty_mapping):
        """Decoding all 256 byte values with an empty mapping and "ignore"
        must yield an empty string."""
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_mapping),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        self._assert_decodes("abc", "abc")
        self._assert_decodes("\U0010FFFFbc", "\U0010FFFFbc")

        # Byte 0x02 is either past the end of the map or maps to U+FFFE.
        self._check_unmapped_byte(["ab", "ab\ufffe"])

        self._check_ignore_all_bytes("")

    def test_decode_with_int2str_map(self):
        self._assert_decodes({0: 'a', 1: 'b', 2: 'c'}, "abc")
        self._assert_decodes({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc")
        self._assert_decodes({0: '\U0010FFFF', 1: 'b', 2: 'c'},
                             "\U0010FFFFbc")
        self._assert_decodes({0: 'a', 1: 'b', 2: ''}, "ab")

        self._check_unmapped_byte([
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},  # Issue #14850
        ])

        self._check_ignore_all_bytes({})

    def test_decode_with_int2int_map(self):
        a, b, c = map(ord, "abc")

        self._assert_decodes({0: a, 1: b, 2: c}, "abc")

        # Issue #15379
        self._assert_decodes({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc")

        self._assert_decodes({0: sys.maxunicode, 1: b, 2: c},
                             chr(sys.maxunicode) + "bc")

        # A code point above sys.maxunicode is rejected with TypeError.
        self._assert_strict_fails({0: sys.maxunicode + 1, 1: b, 2: c},
                                  TypeError)

        self._check_unmapped_byte([
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ])
2369
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002370
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The wrapper transcodes UTF-8 file content to Latin-1 on read and
        # closes the wrapped file when the with block exits.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Leaving the with block must close the wrapped stream too.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002384
Victor Stinnerf96418d2015-09-21 23:06:27 +02002385
class TypesTest(unittest.TestCase):
    """Checks on which argument types the low-level decoders accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = [
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        ]
        if hasattr(codecs, "mbcs_decode"):
            codec_names.append("mbcs")
        # Resolve every decoder before asserting anything.
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is rejected in strict mode; the expected results for
        # the "replace" and "backslashreplace" handlers are pinned below.
        hex_escaped = r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30"
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                             (hex_escaped, 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002425
Serhiy Storchakad6793772013-01-29 10:20:44 +02002426
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        backslash = ord("\\")
        for b in range(32, 127):
            if b != backslash:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character with
        # the same ordinal.
        decode = codecs.unicode_escape_decode
        backslash = ord("\\")
        for b in range(256):
            if b != backslash:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and everything in 127..255 use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        recognized = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in recognized:
            check(raw, text)
        # A backslash followed by a letter that does not start a recognized
        # escape is kept as-is, with a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of hex digits the escape requires).
        # \U needs eight digits, so every run of 0..7 digits is truncated.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2513
2514
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() /
    raw_unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below U+0100 is encoded as the byte with the
        # same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the character with the same ordinal.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but "u"/"U" passes through.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash followed by anything but "u"/"U" passes through.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of hex digits the escape requires).
        # \U needs eight digits, so every run of 0..7 digits is truncated.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2563
2564
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode()."""

    def test_escape_encode(self):
        # Maps each bytes input to the (escaped bytes, input length) pair
        # that escape_encode() must return.
        expected_by_input = {
            b'': (b'', 0),
            b'foobar': (b'foobar', 6),
            b'spam\0eggs': (b'spam\\x00eggs', 9),
            b'a\'b': (b"a\\'b", 3),
            b'b\\c': (b'b\\\\c', 3),
            b'c\nd': (b'c\\nd', 3),
            b'd\re': (b'd\\re', 3),
            b'f\x7fg': (b'f\\x7fg', 3),
        }
        for raw, expected in expected_by_input.items():
            with self.subTest(data=raw):
                self.assertEqual(codecs.escape_encode(raw), expected)
        # Only bytes objects are accepted.
        for rejected in ('spam', bytearray(b'spam')):
            self.assertRaises(TypeError, codecs.escape_encode, rejected)
2583
2584
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler."""

    def _check_roundtrip(self, encoding, raw, text):
        """Assert that *raw* decodes to *text* and *text* encodes back to
        *raw* when "surrogateescape" is used with *encoding*."""
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2617
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002618
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() files across seek() calls."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = ("utf-16", "utf-16-le", "utf-16-be",
                     "utf-32", "utf-32-le", "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            def reopen():
                # Open the scratch file again in 'w+' mode for this encoding.
                return codecs.open(support.TESTFN, 'w+', encoding=encoding)

            # Check if the BOM is written only once
            with reopen() as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen() as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen() as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen() as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002674
Victor Stinner3fed0872010-05-22 02:16:27 +00002675
# Names of the bytes-to-bytes ("transform") codecs that are always
# available; codecs backed by optional modules are appended further down.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Canonical codec name -> alternative spellings accepted by codecs.lookup()
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; keep the name bound (to None) so tests can be skipped
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

# bz2 is optional as well; nothing else needs the name when it is missing
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002705
Victor Stinnerf96418d2015-09-21 23:06:27 +02002706
class TransformCodecTest(unittest.TestCase):
    # Covers the bytes-to-bytes transform codecs and the errors raised by
    # the text-only str/bytes methods when they are given such a codec.

    def test_basics(self):
        payload = bytes(range(256))
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                # generic codecs interface
                encoded, consumed = codecs.getencoder(codec_name)(payload)
                self.assertEqual(consumed, len(payload))
                decoded, consumed = codecs.getdecoder(codec_name)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, payload)

    def test_read(self):
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(b"\x80", codec_name)
                stream_reader = codecs.getreader(codec_name)(
                    io.BytesIO(encoded))
                self.assertEqual(stream_reader.read(), b"\x80")

    def test_readline(self):
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(b"\x80", codec_name)
                stream_reader = codecs.getreader(codec_name)(
                    io.BytesIO(encoded))
                self.assertEqual(stream_reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                plain_view = memoryview(original)
                encoded = codecs.encode(original, codec_name)
                encoded_from_view = codecs.encode(plain_view, codec_name)
                self.assertEqual(encoded_from_view, encoded)
                encoded_view = memoryview(encoded)
                decoded = codecs.decode(encoded, codec_name)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(encoded_view, codec_name)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        text = "bad input type"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                pattern = template.format(codec_name)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    text.encode(codec_name)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        plain = b"encode first to ensure we meet any format restrictions"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(plain, codec_name)
                pattern = template.format(codec_name)
                with self.assertRaisesRegex(LookupError, pattern):
                    encoded.decode(codec_name)
                with self.assertRaisesRegex(LookupError, pattern):
                    bytearray(encoded).decode(codec_name)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for canonical, alternatives in transform_aliases.items():
            expected_name = codecs.lookup(canonical).name
            for alias in alternatives:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name,
                                     expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        quoted = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(quoted, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        roundtrip = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(roundtrip, unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2838
Nick Coghlan8b097b42013-11-13 23:49:21 +10002839
2840# The codec system tries to wrap exceptions in order to ensure the error
2841# mentions the operation being performed and the codec involved. We
2842# currently *only* want this to happen for relatively stateless
2843# exceptions, where the only significant information they contain is their
2844# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002845
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    # Codec search function: answer only for names stored in the local
    # registry and return None for everything else.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Fallback used when _codecs does not provide _forget_codec:
        # there is nothing to clear, so do nothing.
        return None
2860
2861
class ExceptionChainingTest(unittest.TestCase):
    # Checks when exceptions raised inside a codec are re-raised with a
    # message naming the operation and codec, and when they are left alone.

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        raw_name = repr(self) + str(id(self))
        normalized = encodings.normalize_encoding(raw_name)
        self.codec_name = normalized.lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(self.codec_name)

    def set_codec(self, encode, decode):
        # Publish an encode/decode pair under this test's unique codec name
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Expect the body to raise exc_type with the wrapped message and
        # with the original exception chained as __cause__.
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        name = self.codec_name
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, pattern):
            "str input".encode(name)
        with self.assertRaisesRegex(LookupError, pattern):
            codecs.encode("str input", name)
        with self.assertRaisesRegex(LookupError, pattern):
            b"bytes input".decode(name)
        with self.assertRaisesRegex(LookupError, pattern):
            codecs.decode(b"bytes input", name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, self.codec_name), "not bytes!")
        self.assertEqual(codecs.decode(None, self.codec_name), b"not str!")
        # Text model methods should complain
        encode_pattern = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, encode_pattern):
            "str_input".encode(self.codec_name)
        decode_pattern = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, decode_pattern):
            b"bytes input".decode(self.codec_name)
3021
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003022
Georg Brandl02524622010-12-02 18:06:51 +00003023
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the code page that failed
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each entry is (raw bytes, error handler, expected text); an
        # expected value of None means decoding must raise.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                outcome = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = outcome[0]
            consumed = outcome[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry is (text, error handler, expected bytes); an
        # expected value of None means encoding must raise.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                outcome = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw = outcome[0]
            self.assertEqual(raw, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, raw, expected))
            self.assertEqual(outcome[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # final=False: the result reports how many input bytes were consumed
        steps = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for chunk, expected in steps:
            decoded = codecs.code_page_decode(932, chunk, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            info = codecs.lookup('cp123')
            self.assertEqual(info.name, 'mbcs')
Steve Dowerf5aba582016-09-06 19:42:27 -07003188
Victor Stinner3a50e702011-10-18 21:21:00 +02003189
class ASCIITest(unittest.TestCase):
    # Encoding and decoding with the 'ascii' codec, including the
    # behaviour of the standard error handlers on non-ASCII input.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3227
3228
class Latin1Test(unittest.TestCase):
    # Encoding and decoding with the 'latin1' codec, including the
    # behaviour of the standard error handlers on unencodable input.

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        # (raw bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3264
3265
# Run every test case defined in this module when it is executed as a
# script; importing the module has no such effect.
if __name__ == "__main__":
    unittest.main()