blob: a59a5e21358e7b6b85e1ec62d4a516e0157387b9 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
Antoine Pitrou00b2c862011-10-05 13:01:41 +020012try:
13 import ctypes
14except ImportError:
15 ctypes = None
16 SIZEOF_WCHAR_T = -1
17else:
18 SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000019
def coding_checker(self, coder):
    """Build a checker for a stateless codec function.

    *self* supplies ``assertEqual`` (normally a ``unittest.TestCase``).
    *coder* is called with a single argument and must return a
    ``(output, consumed)`` pair.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``, i.e. that the expected output was
    produced and the whole input was reported as consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
24
Victor Stinnerf96418d2015-09-21 23:06:27 +020025
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer may be any sequence supporting ``+=`` and slicing (the
    tests in this file use ``bytes``); its type is preserved by read().
    """
    def __init__(self, buffer):
        # Initial content of the queue; also fixes the buffer type.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* (the default) drains the whole queue.  Fewer
        than *size* items are returned if the queue is shorter.
        """
        if size < 0:
            # Draining everything is the same as asking for all items.
            size = len(self._buffer)
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
45
Victor Stinnerf96418d2015-09-21 23:06:27 +020046
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks.

    The host class must provide the ``unittest.TestCase`` assertion
    methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder state transfer at every split point of *s*.

        For each prefix ``s[:i]`` a fresh incremental decoder decodes the
        prefix, its state is copied into a second fresh decoder, and the
        second decoder finishes ``s[i:]``.  The concatenated output must
        equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder state transfer at every split point of *u*.

        Mirror image of check_state_handling_decode(): the state taken
        after encoding ``u[:i]`` is installed in a fresh encoder which
        then encodes ``u[i:]``; the concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
79
Victor Stinnerf96418d2015-09-21 23:06:27 +020080
class ReadTest(MixInCheckStateHandling):
    """Shared StreamReader / incremental-decoder tests for one encoding.

    Concrete test classes combine this mixin with ``unittest.TestCase``
    and must define:

    * ``encoding`` -- name of the codec under test;
    * ``ill_formed_sequence`` -- the bytes test_lone_surrogates() expects
      the ``surrogatepass`` handler to produce for U+DC80.
    """

    def check_partial(self, input, partialresults):
        # Encode once; every pass below feeds the same bytes one at a time.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating a StreamReader must yield the input lines unchanged.
        # NOTE(review): the fixture below is reproduced exactly as it
        # appears in this copy of the file; runs of spaces inside the
        # literals may have been collapsed by whatever flattened the
        # file's whitespace -- confirm against the upstream test before
        # relying on specific line lengths.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written through the
        # writer becomes available to the reader incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text that decoding ill_formed_sequence with the "replace" handler is
    # expected to produce in test_lone_surrogates().
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Whatever the codec emits for the empty string is treated as its
        # BOM and stripped from the individually encoded pieces below.
        bom = "".encode(self.encoding)
        # Expected "backslashreplace" rendering of the ill-formed bytes;
        # it does not depend on the surrounding text, so build it once.
        backslashreplace = ''.join('\\x%02x' % b
                                   for b in self.ill_formed_sequence)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200403
Victor Stinnerf96418d2015-09-21 23:06:27 +0200404
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"
    # U+DC80 as a single UTF-32 code unit in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
499
Victor Stinnerf96418d2015-09-21 23:06:27 +0200500
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # U+DC80 as a single little-endian UTF-32 code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected entry per input byte: a character appears only once
        # its fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
545
Victor Stinnerf96418d2015-09-21 23:06:27 +0200546
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # U+DC80 as a single big-endian UTF-32 code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected entry per input byte: a character appears only once
        # its fourth byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
591
592
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"
    # U+DC80 as a single UTF-16 code unit in the platform's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian form.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes on failure)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000678
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian UTF-16 codec."""
    encoding = "utf-16-le"
    # U+DC80 (a lone low surrogate) in little-endian byte order.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # Expected decoder output after each successive chunk handed to
        # check_partial (presumably one input byte at a time -- confirm
        # against the base class).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one failing row from hiding the others and
            # names the offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A character outside the BMP round-trips as a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
722
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec."""
    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # Expected decoder output after each successive chunk handed to
        # check_partial (presumably one input byte at a time -- confirm
        # against the base class).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps one failing row from hiding the others and
            # names the offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A character outside the BMP round-trips as a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
766
class UTF8Test(ReadTest, unittest.TestCase):
    """UTF-8 codec tests; the codec name and BOM are read through ``self``."""
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # Expected decoder output after each successive chunk handed to
        # check_partial (presumably one input byte at a time -- confirm
        # against the base class).  Each prefix is repeated while the
        # bytes of the following character are still arriving.
        prefixes = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, prefixes)

    def test_decoder_state(self):
        # Sample text spanning 1- to 4-byte UTF-8 sequences.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        invalid = b'[\x80\xff]'
        # Error handler name -> text produced for the invalid input.
        outcomes = {
            'ignore': '[]',
            'replace': '[\ufffd\ufffd]',
            'surrogateescape': '[\udc80\udcff]',
            'backslashreplace': '[\\x80\\xff]',
        }
        for error_handler, expected in outcomes.items():
            with self.subTest(data=invalid, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(invalid.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        err = caught.exception
        # The reported error range must cover exactly these two surrogates.
        self.assertEqual(err.object[err.start:err.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        enc = self.encoding
        # (text, encoded bytes without the BOM)
        encode_cases = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ]
        for text, encoded in encode_cases:
            self.assertEqual(text.encode(enc, "surrogatepass"),
                             self.BOM + encoded)

        # (encoded bytes, decoded text)
        decode_cases = [
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        ]
        for encoded, text in decode_cases:
            self.assertEqual(encoded.decode(enc, "surrogatepass"), text)

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated surrogate sequences must still be rejected.
        for truncated in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                truncated.decode(enc, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000842
Victor Stinnerf96418d2015-09-21 23:06:27 +0200843
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only cp65001 codec."""
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means the call must
        # raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # subTest lets every row run and names the failing one.
            with self.subTest(text=text, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, self.encoding, errors)
                    continue
                try:
                    encoded = text.encode(self.encoding, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                                 '%a.encode("cp65001", %r)=%a != %a'
                                 % (text, errors, encoded, expected))

    def test_decode(self):
        # (bytes, error handler, expected text); None means the call must
        # raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # subTest lets every row run and names the failing one.
            with self.subTest(raw=raw, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, self.encoding, errors)
                    continue
                try:
                    decoded = raw.decode(self.encoding, errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                                 '%a.decode("cp65001", %r)=%a != %a'
                                 % (raw, errors, decoded, expected))

    def test_lone_surrogates(self):
        enc = self.encoding
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, enc)
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, enc)
        # (error handler, bytes produced for a lone low surrogate)
        handler_results = [
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for errors, expected in handler_results:
            with self.subTest(errors=errors):
                self.assertEqual("[\uDC80]".encode(enc, errors), expected)

    def test_surrogatepass_handler(self):
        enc = self.encoding
        self.assertEqual("abc\ud800def".encode(enc, "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode(enc, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode(enc, "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(enc, "surrogatepass"),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
931
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200932
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec."""
    encoding = "utf-7"

    def test_ascii(self):
        enc = self.encoding

        def assert_passthrough(text):
            # Encoding must equal the plain ASCII bytes, and those bytes
            # must decode back to the same text.
            ascii_bytes = text.encode('ascii')
            self.assertEqual(text.encode(enc), ascii_bytes)
            self.assertEqual(ascii_bytes.decode(enc), text)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        assert_passthrough(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        assert_passthrough(set_o)
        # +
        self.assertEqual('a+b'.encode(enc), b'a+-b')
        self.assertEqual(b'a+-b'.decode(enc), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        assert_passthrough(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_checked))
        self.assertEqual(other_ascii.encode(enc),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # Expected decoder output after each successive chunk handed to
        # check_partial (presumably one input byte at a time -- confirm
        # against the base class); the repeat counts give the number of
        # consecutive steps each prefix stays unchanged.
        prefixes = (
            ['a'] * 2
            + ['a+']
            + ['a+-']
            + ['a+-b'] * 5
            + ['a+-b\x00']
            + ['a+-b\x00c'] * 5
            + ['a+-b\x00c\x80']
            + ['a+-b\x00c\x80d'] * 5
            + ['a+-b\x00c\x80d\u0100']
            + ['a+-b\x00c\x80d\u0100e'] * 8
            + ['a+-b\x00c\x80d\u0100e\U00010000']
            + ['a+-b\x00c\x80d\u0100e\U00010000f']
        )
        self.check_partial(text, prefixes)

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        malformed = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in malformed:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        enc = self.encoding
        # An explicit surrogate pair encodes like the astral character.
        self.assertEqual('\ud801\udca0'.encode(enc), b'+2AHcoA-')
        # (text, encoded form including the closing '-')
        round_trips = [
            ('\U000104A0', b'+2AHcoA-'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        ]
        for text, encoded in round_trips:
            self.assertEqual(text.encode(enc), encoded)
            self.assertEqual(encoded.decode(enc), text)
            # The closing '-' is optional when decoding.
            self.assertEqual(encoded[:-1].decode(enc), text)

    def test_lone_surrogates(self):
        # (input, result of decoding with the 'replace' handler)
        samples = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in samples:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001064
1065
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A lone byte with final=True cannot be decoded strictly.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1073
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        spam = array.array("b", b"spam")
        # The array's buffer comes back as bytes, with the length consumed.
        self.assertEqual(codecs.readbuffer_encode(spam), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # A missing argument and an int argument are both rejected.
        for bad_args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *bad_args)
1089
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Runs the UTF-8 tests against "utf-8-sig", plus BOM-specific checks."""
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # The text itself starts with U+FEFF, so the encoded input carries
        # the signature BOM followed by a second, literal BOM.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder is dropped by the
        # incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a "utf-8-sig" StreamReader with a range
        # of read sizes and check the pieces join up to unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest names the read size that failed.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1174
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        # Every byte except the backslash passes through unchanged.
        for value in range(256):
            char = bytes([value])
            if char == b'\\':
                continue
            data = char + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, decoded output)
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, decoded in recognised:
            check(escaped, decoded)

        # Unrecognised escapes are left as-is but must warn.
        for code in range(97, 123):
            lower = bytes([code])
            if lower not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, b"\\" + lower)
            upper = lower.upper()
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + upper, b"\\" + upper)
        for escaped, decoded in ((br"\8", b"\\8"),
                                 (br"\9", b"\\9"),
                                 (b"\\\xfa", b"\\\xfa")):
            with self.assertWarns(DeprecationWarning):
                check(escaped, decoded)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes with zero and with one hex digit; the
        # combined input is 6 and 8 bytes long respectively.
        for digits, consumed in ((b"", 6), (b"0", 8)):
            bare = b"\\x" + digits
            bracketed = b"[" + bare + b"]"
            self.assertRaises(ValueError, decode, bare)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(bracketed + bare, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(decode(bracketed + bare, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001234
Victor Stinnerf96418d2015-09-21 23:06:27 +02001235
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001236class RecodingTest(unittest.TestCase):
1237 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001238 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +02001239 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001240 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001241 f2.close()
1242 # Python used to crash on this at exit because of a refcount
1243 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +00001244
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001245 self.assertTrue(f.closed)
1246
Martin v. Löwis2548c732003-04-18 10:39:54 +00001247# From RFC 3492
1248punycode_testcases = [
1249 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001250 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1251 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001252 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001253 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001254 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001255 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001257 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001258 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001259 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001260 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1261 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1262 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001263 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001264 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001265 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1266 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1267 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001268 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001269 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001270 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001271 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1272 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1273 "\u0939\u0948\u0902",
1274 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001275
1276 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001277 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001278 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1279 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280
1281 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001282 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1283 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1284 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001285 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1286 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287
1288 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001289 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1290 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1291 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1292 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001293 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001294
1295 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001296 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1297 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1298 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1299 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1300 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001301 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302
1303 # (K) Vietnamese:
1304 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1305 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001306 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1307 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1308 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1309 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001310 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001313 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001314 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001315
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001317 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1318 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1319 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001320 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321
1322 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001323 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1324 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1325 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001326 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001327
1328 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001329 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001330 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001331
1332 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001333 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1334 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001335 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001336
1337 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001338 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001339 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340
1341 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001342 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001343 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001344
1345 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001346 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1347 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001348 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349 ]
1350
# Import-time sanity check of the punycode_testcases table: every entry
# must be a 2-tuple (the tests below unpack each one as ``uni, puny``).
# Malformed entries are printed rather than raised so the module still
# imports.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001354
Victor Stinnerf96418d2015-09-21 23:06:27 +02001355
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest makes a failure report name the offending vector.
            with self.subTest(uni=uni):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode once more from a bytes object rebuilt through an
                # ASCII round trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001374
Victor Stinnerf96418d2015-09-21 23:06:27 +02001375
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    # support.check_warnings() filter (message regex, category) for the
    # deprecation warning these tests expect the codec to emit.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The literals above are written big-endian; flip them on
        # little-endian hosts.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(self._DEPRECATED):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the Unicode range.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises fails the test by itself when nothing is raised, so
        # no explicit try/except/else/self.fail() is needed.
        with support.check_warnings(self._DEPRECATED):
            with self.assertRaises(UnicodeDecodeError) as cm:
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        # The error must cover exactly the second 4-byte unit.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            # Splice one invalid 4-byte unit between "a" and "b"; the
            # registered handler is expected to drop it.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001456
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None means nameprep must reject the input; an
# entry of (None, None) is a skipped vector, kept as a placeholder so the
# list index still matches the "3.N" numbering of the draft.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1609
1610
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Label each vector with its "3.N" number. subTest keeps the
            # original traceback and lets the remaining vectors run, for
            # the rejection branch as well as the comparison branch.
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001629
Victor Stinnerf96418d2015-09-21 23:06:27 +02001630
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Nothing is left to read after the first call.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed each input one byte at a time through codecs.iterdecode().
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for data, expected in cases:
            with self.subTest(data=data):
                chunks = (bytes([c]) for c in data)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # With no trailing dot, the last label only comes out on the
        # final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot, the last label comes out as soon as the
        # dot is seen.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # codecs.iterencode() passes the text one character at a time.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # With no trailing dot, the last label only comes out on the
        # final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        # "strict" must work for both directions...
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        # ...and every other handler must be rejected.
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1716
Victor Stinnerf96418d2015-09-21 23:06:27 +02001717
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it runs on every exit
        # path, including the skip below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        # Every exported name must exist on the module. The loop variable
        # has its own name so it does not shadow the ``api`` tuple.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec must fail for any input, including empty
        # input, and under any error handler.
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)
1825
Victor Stinnerf96418d2015-09-21 23:06:27 +02001826
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 StreamReader over an in-memory byte stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters (U+D55C, U+AE00) separated by a
        # newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        # The line ending is kept; the last line has none.
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001836
Victor Stinnerf96418d2015-09-21 23:06:27 +02001837
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding on read and on write."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes given to the wrapper land in the file as
        # Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001849
# Names of the text encodings iterated by the tests that follow.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are only added when this build of the codecs module
# exposes the matching encode function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001959
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode; the tests below skip
# their stream and incremental checks for them.
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001969
Victor Stinnerf96418d2015-09-21 23:06:27 +02001970
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks run against every codec in all_unicode_encodings.

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            msg = "encoding=%r" % encoding

            # The name reported by the codec registry must match the list
            # entry once "_" and "-" are treated as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), msg)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, msg)

            # Everything below exercises stateful APIs.
            if encoding in broken_unicode_with_stateful:
                continue

            # check stream reader/writer, one character / one byte at a time
            q = Queue(b"")
            writer = codecs.getwriter(encoding)(q)
            encodedresult = b""
            for c in s:
                writer.write(c)
                chunk = q.read()
                self.assertIs(type(chunk), bytes, msg)
                encodedresult += chunk
            q = Queue(b"")
            reader = codecs.getreader(encoding)(q)
            decodedresult = ""
            for c in encodedresult:
                q.write(bytes([c]))
                decodedresult += reader.read()
            self.assertEqual(decodedresult, s, msg)

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = b""
                for c in s:
                    encodedresult += encoder.encode(c)
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += decoder.decode(bytes([c]))
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, s, msg)

                # check iterencode()/iterdecode()
                result = "".join(codecs.iterdecode(
                    codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, msg)

                # check iterencode()/iterdecode() with empty string
                result = "".join(codecs.iterdecode(
                    codecs.iterencode("", encoding), encoding))
                self.assertEqual(result, "", msg)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = "".join(decoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            msg = "encoding=%r" % encoding

            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s, msg)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # Calling without the required input argument must fail.
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2132
Victor Stinnerf96418d2015-09-21 23:06:27 +02002133
class CharmapTest(unittest.TestCase):
    # Tests for codecs.charmap_decode() with str, int->str and int->int maps.

    # Input shared by most cases: each byte is looked up in the mapping.
    _DATA = b"\x00\x01\x02"

    def _check_decoded(self, cases, data=_DATA):
        # Each case is (errors, mapping, expected_text).  The reported
        # consumed length must always equal the full input length.
        for errors, mapping, expected in cases:
            with self.subTest(errors=errors, mapping=mapping):
                self.assertEqual(
                    codecs.charmap_decode(data, errors, mapping),
                    (expected, len(data))
                )

    def _check_strict_raises(self, exc_type, mappings, data=_DATA):
        # Decoding with the "strict" handler must raise exc_type for every
        # mapping given.
        for mapping in mappings:
            with self.subTest(mapping=mapping):
                self.assertRaises(exc_type, codecs.charmap_decode,
                                  data, "strict", mapping)

    def test_decode_with_string_map(self):
        self._check_decoded([
            ("strict", "abc", "abc"),
            ("strict", "\U0010FFFFbc", "\U0010FFFFbc"),
            # Byte 2 is either past the end of the map or mapped to U+FFFE;
            # both are treated as undefined by the error handlers below.
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("backslashreplace", "ab", "ab\\x02"),
            ("backslashreplace", "ab\ufffe", "ab\\x02"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._check_strict_raises(UnicodeDecodeError, ["ab", "ab\ufffe"])

        allbytes = bytes(range(256))
        self._check_decoded([("ignore", "", "")], data=allbytes)

    def test_decode_with_int2str_map(self):
        self._check_decoded([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            # Mapped values may be longer than one character or empty.
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
            # Missing key, None and U+FFFE (Issue #14850) are all undefined.
            ("replace", {0: 'a', 1: 'b'}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: None}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: '\ufffe'}, "ab\ufffd"),
            ("backslashreplace", {0: 'a', 1: 'b'}, "ab\\x02"),
            ("backslashreplace", {0: 'a', 1: 'b', 2: None}, "ab\\x02"),
            ("backslashreplace", {0: 'a', 1: 'b', 2: '\ufffe'}, "ab\\x02"),
            ("ignore", {0: 'a', 1: 'b'}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: None}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: '\ufffe'}, "ab"),
        ])
        self._check_strict_raises(UnicodeDecodeError, [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},  # Issue #14850
        ])

        allbytes = bytes(range(256))
        self._check_decoded([("ignore", {}, "")], data=allbytes)

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self._check_decoded([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             chr(sys.maxunicode) + "bc"),
            ("replace", {0: a, 1: b}, "ab\ufffd"),
            ("replace", {0: a, 1: b, 2: 0xFFFE}, "ab\ufffd"),
            ("backslashreplace", {0: a, 1: b}, "ab\\x02"),
            ("backslashreplace", {0: a, 1: b, 2: 0xFFFE}, "ab\\x02"),
            ("ignore", {0: a, 1: b}, "ab"),
            ("ignore", {0: a, 1: b, 2: 0xFFFE}, "ab"),
        ])

        # A code point above sys.maxunicode is a type error, not a decode
        # error.
        self._check_strict_raises(TypeError, [
            {0: sys.maxunicode + 1, 1: b, 2: c},
        ])
        self._check_strict_raises(UnicodeDecodeError, [
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ])
2368
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002369
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers and close
    # the wrapped stream on exit.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as for EncodedFile above.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002383
Victor Stinnerf96418d2015-09-21 23:06:27 +02002384
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # These decoders are optional: include them only when the codecs
        # module provides them.
        for optional in ("mbcs_decode", "oem_decode"):
            if hasattr(codecs, optional):
                decoders.append(getattr(codecs, optional))
        for decoder in decoders:
            with self.subTest(decoder=decoder.__name__):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"),
                         ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"),
                         ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"),
                         ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"),
                         ("\u1234", 6))

        # \U00110000 is beyond the last valid code point (U+10FFFF).
        self.assertRaises(UnicodeDecodeError,
                          codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(
            codecs.unicode_escape_decode(r"\U00110000", "replace"),
            ("\ufffd", 10))
        self.assertEqual(
            codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
            (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))

        self.assertRaises(UnicodeDecodeError,
                          codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(
            codecs.raw_unicode_escape_decode(r"\U00110000", "replace"),
            ("\ufffd", 10))
        self.assertEqual(
            codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
            (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002424
Serhiy Storchakad6793772013-01-29 10:20:44 +02002425
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for the "unicode_escape" encode/decode functions.

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through as is.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point with
        # the same value.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognised escapes are kept verbatim but must emit a
        # DeprecationWarning.  i walks the ASCII codes of 'a'..'z'; i-32 is
        # the matching upper-case letter.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # c is the escape letter, d the number of truncated lengths (0..d-1
        # hex digits) tried after it.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2512
2513
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for the "raw_unicode_escape" encode/decode functions.

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of that value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but 'u'/'U' is left untouched.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # c is the escape letter, d the number of truncated lengths (0..d-1
        # hex digits) tried after it.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2562
2563
class EscapeEncodeTest(unittest.TestCase):

    def test_escape_encode(self):
        # Each entry is (input bytes, (escaped bytes, input length)).
        tests = [
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        ]
        for data, output in tests:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)
        # Only bytes objects are accepted: str and bytearray are rejected.
        self.assertRaises(TypeError, codecs.escape_encode, 'spam')
        self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2582
2583
class SurrogateEscapeTest(unittest.TestCase):
    # With "surrogateescape", an undecodable byte 0xNN becomes the lone
    # surrogate U+DCNN, and encoding turns it back into the original byte.

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual(
            "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
            b"\xe4\xeb\xef\xf6\xfc")
2616
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002617
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write/seek/read sequences on a codecs.open() file; every read-back
        # must equal exactly the text that was written.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a
                # position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a
                # seek() at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002673
Victor Stinner3fed0872010-05-22 02:16:27 +00002674
# Names of the bytes-to-bytes codecs exercised by the transform tests.
# The compression codecs are appended further down, and only when the
# module backing them can actually be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec"]
bytes_transform_encodings += ["quopri_codec", "hex_codec"]

# For each transform codec, the alternative names it must be reachable by.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so later code can test for zlib availability.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    # Without bz2 the codec is simply left out of both tables.
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002704
Victor Stinnerf96418d2015-09-21 23:06:27 +02002705
class TransformCodecTest(unittest.TestCase):

    def test_basics(self):
        # Push every possible byte value through the generic codecs
        # interface of each bytes-to-bytes codec and back again.
        payload = bytes(range(256))
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded, consumed = codecs.getencoder(codec_name)(payload)
                self.assertEqual(consumed, len(payload))
                decoded, consumed = codecs.getdecoder(codec_name)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, payload)

    def test_read(self):
        # A StreamReader over encoded data must hand back the raw byte.
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                stream = io.BytesIO(codecs.encode(b"\x80", codec_name))
                reader_factory = codecs.getreader(codec_name)
                self.assertEqual(reader_factory(stream).read(), b"\x80")

    def test_readline(self):
        # Same as test_read, but going through readline().
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                stream = io.BytesIO(codecs.encode(b"\x80", codec_name))
                reader_factory = codecs.getreader(codec_name)
                self.assertEqual(reader_factory(stream).readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(original, codec_name)
                encoded_from_view = codecs.encode(memoryview(original),
                                                  codec_name)
                self.assertEqual(encoded_from_view, encoded)
                decoded = codecs.decode(encoded, codec_name)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  codec_name)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                pattern = template.format(codec_name)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.encode(codec_name)
                # The lookup error is raised directly, not chained
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded_data = codecs.encode(data, codec_name)
                pattern = template.format(codec_name)
                # Immutable and mutable byte containers must both refuse
                for source in (encoded_data, bytearray(encoded_data)):
                    with self.assertRaisesRegex(LookupError, pattern):
                        source.decode(codec_name)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    def _check_decode_error_is_wrapped(self, codec_name):
        # Decoding malformed input must raise an error that mentions the
        # operation and codec, chained from an exception of the same type.
        pattern = "^decoding with {!r} codec failed".format(codec_name)
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", codec_name)
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        self._check_decode_error_is_wrapped("zlib_codec")

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        self._check_decode_error_is_wrapped("hex_codec")

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name, canonical)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        self.assertEqual(codecs.encode(b"space tab\teol \n", "quopri-codec"),
                         b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        roundtrip = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(roundtrip, unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2837
Nick Coghlan8b097b42013-11-13 23:49:21 +10002838
2839# The codec system tries to wrap exceptions in order to ensure the error
2840# mentions the operation being performed and the codec involved. We
2841# currently *only* want this to happen for relatively stateless
2842# exceptions, where the only significant information they contain is their
2843# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002844
2845# Use a local codec registry to avoid appearing to leak objects when
Martin Panter119e5022016-04-16 09:28:57 +00002846# registering multiple search functions
# Private registry consulted by the search function below; test cases add
# and remove their throwaway codecs here.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Codec search function: hand back the registered CodecInfo, or None
    # so that the codec machinery moves on to the next search function.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# register() returns None, so it cannot be applied as a decorator.
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Nothing to forget when _codecs does not provide the hook.
        return None
2859
2860
class ExceptionChainingTest(unittest.TestCase):

    def setUp(self):
        # A codec search function cannot be unregistered, so instead each
        # test case gets its own codec name (built from the test case repr)
        # which is rendered harmless again once the test case finishes.
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # id(self) is mixed in to make the name truly unique, avoiding
        # trouble with the codec cache when the tests are run several
        # times (e.g. when hunting for refleaks).
        unique_id = "{!r}{}".format(self, id(self))
        normalized = encodings.normalize_encoding(unique_id)
        self.codec_name = normalized.lower()

        # The object to raise lives on the instance: codec caching means
        # the codec entry can't be recreated, while regrtest refleak
        # hunting runs the same test instance repeatedly. The codec
        # functions therefore have to call back into the instance to learn
        # what to raise instead of closing over an object that may change
        # on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        # Publish the given encode/decode pair under this test's codec name.
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager: the body must raise exc_type with the wrapped
        # message format, chained from an exc_type that has a traceback.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Every text/codecs entry point must wrap the raised object.
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Every entry point must let the original exception through as is.
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        # The initial codec lookup should not be wrapped
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                      r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_fmt.format(name)):
            "str_input".encode(name)
        decode_fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                      r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_fmt.format(name)):
            b"bytes input".decode(name)
3020
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003021
Georg Brandl02524622010-12-02 18:06:51 +00003022
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError, an unknown
        # one with OSError, for both directions.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must name the code page involved.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each entry is (raw bytes, error handler, expected text); an
        # expected value of None means decoding must fail.
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode,
                                  cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = decoded
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry is (text, error handler, expected bytes); an
        # expected value of None means encoding must fail.
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode,
                                  cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw, consumed = encoded
            self.assertEqual(raw, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, raw, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left
        # unconsumed instead of being reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        fake_acp = mock.patch('_winapi.GetACP', return_value=123)
        with fake_acp:
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')
Steve Dowerf5aba582016-09-06 19:42:27 -07003187
Victor Stinner3a50e702011-10-18 21:21:00 +02003188
class ASCIITest(unittest.TestCase):
    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # Each row: text with non-ASCII characters, the error handler to
        # apply, and the bytes that handler must produce.
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                produced = text.encode('ascii', handler)
                self.assertEqual(produced, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # Each row: bytes outside the ASCII range, the error handler to
        # apply, and the text that handler must produce.
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                produced = raw.decode('ascii', handler)
                self.assertEqual(produced, wanted)
3226
3227
class Latin1Test(unittest.TestCase):
    def test_encode(self):
        # Code points up to U+00FF map one-to-one onto bytes.
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                produced = text.encode('latin1')
                self.assertEqual(produced, wanted)

    def test_encode_errors(self):
        # Each row: text that latin1 cannot represent, the error handler
        # to apply, and the bytes that handler must produce.
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                produced = text.encode('latin1', handler)
                self.assertEqual(produced, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # Every byte value decodes to the code point of the same number.
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                produced = raw.decode('latin1')
                self.assertEqual(produced, wanted)
3263
3264
# When executed as a script (rather than imported), hand control to
# unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()