blob: 00b5d317c4013e6693f3865d35334351f45f6ee0 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner91106cd2017-12-13 12:29:09 +01008from unittest import mock
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# Optional module: fall back to None when it cannot be imported so that
# code further down can test for its availability.
try:
    import _testcapi
except ImportError:
    _testcapi = None

# ctypes is optional as well; SIZEOF_WCHAR_T is -1 when ctypes is missing
# and the size of a C wchar_t (as reported by ctypes) otherwise.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build a checker for *coder* that reports through the test case *self*.

    The returned ``check(input, expect)`` asserts (via ``self.assertEqual``)
    that ``coder(input)`` equals the tuple ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
Victor Stinnerf96418d2015-09-21 23:06:27 +020030
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also determines the type of the data that
        # read() hands back (slices of this object).
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
50
Victor Stinnerf96418d2015-09-21 23:06:27 +020051
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() of the
    incremental codecs.

    The helpers report through the ``assert*`` methods of ``self``, so the
    mixin has to be combined with a ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every position and decode it with two decoders.

        The state of the decoder that handled the first part is transferred
        to a fresh decoder that handles the rest; the concatenated output
        must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every position and encode it with two encoders.

        The state of the encoder that handled the first part is transferred
        to a fresh encoder that handles the rest; the concatenated output
        must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
84
Victor Stinnerf96418d2015-09-21 23:06:27 +020085
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    Concrete subclasses provide ``encoding`` and ``ill_formed_sequence``
    (the encoded form of the lone surrogate U+DC80 used by
    test_lone_surrogates) and mix in ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* to the codec one byte at a time.

        After each byte the text decoded so far must equal the matching
        entry of *partialresults*.  The check is done with a StreamReader,
        with an incremental decoder, with the same decoder after reset()
        and finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line and join the lines with "|" so that
            # the line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded into the stream.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so everything written
        # is available to the following readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding the empty string leaves only what the codec emits up
        # front (the BOM, if there is one).
        bom = "".encode(self.encoding)
        # Does not depend on the loop variables, so it is built only once.
        backslashreplace = ''.join('\\x%02x' % b
                                   for b in self.ill_formed_sequence)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200408
Victor Stinnerf96418d2015-09-21 23:06:27 +0200409
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (BOM followed by native byte order)."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
504
Victor Stinnerf96418d2015-09-21 23:06:27 +0200505
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # Four bytes per character: a character shows up in the output
        # only once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
550
Victor Stinnerf96418d2015-09-21 23:06:27 +0200551
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # Four bytes per character: a character shows up in the output
        # only once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
596
597
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (BOM followed by native byte order)."""

    encoding = "utf-16"
    # Encoded form of the lone surrogate U+DC80 in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = info.streamwriter(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = info.streamreader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the codecs.open() call is wrapped in the DeprecationWarning
        # check; the read happens outside of it.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000683
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec (fixed byte order, no BOM)."""

    encoding = "utf-16-le"
    # U+DC80 (a lone low surrogate) in little-endian byte order; used by
    # checks inherited from ReadTest (not visible in this part of the file).
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected entry per encoded byte of the sample text.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing input and lets the remaining
            # cases run instead of stopping at the first failure.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 round-trips through a little-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
727
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec (fixed byte order, no BOM)."""

    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate) in big-endian byte order; used by
    # checks inherited from ReadTest (not visible in this part of the file).
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected entry per encoded byte of the sample text.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing input and lets the remaining
            # cases run instead of stopping at the first failure.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 round-trips through a big-endian surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
771
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec; subclasses may override encoding/BOM."""

    encoding = "utf-8"
    # UTF-8-style encoding of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoded output (none for plain UTF-8).
    BOM = b''

    def test_partial(self):
        # One expected entry per encoded byte of the sample text.
        sample = "\x00\xff\u07ff\u0800\uffff\U00010000"
        decoded_after_each_byte = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(sample, decoded_after_each_byte)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_decode_error(self):
        # The same two invalid bytes, decoded under each lenient handler.
        data = b'[\x80\xff]'
        outcomes = [
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        ]
        for error_handler, expected in outcomes:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # U+DC80 can be escaped, but the following two surrogates cannot;
        # the error must span exactly those two.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        failure = cm.exception
        offending = failure.object[failure.start:failure.end]
        self.assertEqual(offending, '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        handler = "surrogatepass"

        encode_cases = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ]
        for text, payload in encode_cases:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + payload)

        decode_cases = [
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        ]
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))

        # Incomplete surrogate sequences are still rejected.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000847
Victor Stinnerf96418d2015-09-21 23:06:27 +0200848
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec."""

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means "must raise".
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            message = ('%a.encode("cp65001", %r)=%a != %a'
                       % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, message)

    def test_decode(self):
        # (bytes, error handler, expected text); None means "must raise".
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            message = ('%a.decode("cp65001", %r)=%a != %a'
                       % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, message)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # Every other handler produces a well-defined substitute.
        per_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in per_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler),
                             expected)

    def test_surrogatepass_handler(self):
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
936
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200937
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        def check_direct(chars):
            # These characters must pass through UTF-7 unchanged.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(self.encoding), as_ascii)
            self.assertEqual(as_ascii.decode(self.encoding), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_covered))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected entry per encoded byte; repeated entries cover the
        # bytes of a shifted sequence that has not yet produced output.
        sample = 'a+-b\x00c\x80d\u0100e\U00010000f'
        decoded_after_each_byte = (
            ['a'] * 2
            + ['a+', 'a+-']
            + ['a+-b'] * 5
            + ['a+-b\x00']
            + ['a+-b\x00c'] * 5
            + ['a+-b\x00c\x80']
            + ['a+-b\x00c\x80d'] * 5
            + ['a+-b\x00c\x80d\u0100']
            + ['a+-b\x00c\x80d\u0100e'] * 8
            + ['a+-b\x00c\x80d\u0100e\U00010000',
               'a+-b\x00c\x80d\u0100e\U00010000f']
        )
        self.check_partial(sample, decoded_after_each_byte)

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        cases = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        enc = self.encoding
        astral = '\U000104A0'

        # The astral character and its explicit surrogate pair encode
        # to the same bytes.
        self.assertEqual(astral.encode(enc), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(enc), b'+2AHcoA-')
        # The closing '-' is optional when decoding.
        for encoded in (b'+2AHcoA-', b'+2AHcoA'):
            self.assertEqual(encoded.decode(enc), astral)

        # Same checks with one and two BMP characters in front.
        prefixed = [
            ('\u20ac', b'+IKzYAdyg'),
            ('\u20ac\u20ac', b'+IKwgrNgB3KA'),
        ]
        for prefix, unterminated in prefixed:
            text = prefix + astral
            terminated = unterminated + b'-'
            self.assertEqual(text.encode(enc), terminated)
            self.assertEqual(terminated.decode(enc), text)
            self.assertEqual(unterminated.decode(enc), text)

    def test_lone_surrogates(self):
        # (input, expected result with the 'replace' handler)
        cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001070
1071
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        # A single final byte is truncated input and must be rejected.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1079
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        buffer = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buffer), (b"spam", 4))

    def test_empty(self):
        # An empty str yields empty bytes with zero length consumed.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an argument that is not a buffer.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1095
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for "utf-8-sig": UTF-8 with a leading BOM signature."""

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        sample = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        decoded_after_each_byte = [
            "",
            "",
            "",  # First BOM has been read and skipped
            "",
            "",
            "\ufeff",  # Second BOM has been read and emitted
            "\ufeff\x00",  # "\x00" read and emitted
            "\ufeff\x00",  # First byte of encoded "\xff" read
            "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
            "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(sample, decoded_after_each_byte)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder strips the signature the encoder wrote.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means "no size argument") and check that the
        # concatenated chunks always equal unistring.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None, *range(1, 11), 64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = make_reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the signature.
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Input without the signature decodes to the same text.
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
1180
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            single = bytes([code])
            if single == b'\\':
                continue
            data = single + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)

        # Recognised escapes: (escaped input, decoded output).
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in recognised:
            check(escaped, expected)

        # Unrecognised escapes are kept verbatim but warn.
        for code in range(ord('a'), ord('z') + 1):
            letter = bytes([code])
            if letter in b'abfnrtvx':
                continue
            for variant in (letter, letter.upper()):
                unknown = b"\\" + variant
                with self.assertWarns(DeprecationWarning):
                    check(unknown, unknown)
        for unknown in (br"\8", br"\9", b"\\\xfa"):
            with self.assertWarns(DeprecationWarning):
                check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # A \x escape with zero or one hex digit is malformed; the second
        # value is the length consumed by bracketed + bad.
        for bad, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + bad + b"]"
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(bracketed + bad, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(decode(bracketed + bad, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001240
Victor Stinnerf96418d2015-09-21 23:06:27 +02001241
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the recoding wrapper must close the wrapped stream too.
        self.assertTrue(raw.closed)
1252
# From RFC 3492
# Each entry is a (text, punycode bytes) pair; the letter in each comment
# is the label carried by that sample.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1356
# Import-time sanity check: every punycode test case must be a two-item
# tuple.  Anything else is printed so a malformed entry is easy to spot.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001360
Victor Stinnerf96418d2015-09-21 23:06:27 +02001361
class PunycodeTest(unittest.TestCase):
    # Runs every (text, punycode bytes) pair of punycode_testcases through
    # the "punycode" codec in both directions.

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings in the expected data use upper case while
            # our code produces only lower case.  Lower-casing just the
            # expected side would not be enough, because some of the input
            # characters are upper case as well.
            produced = str(text.encode("punycode"), "ascii").lower()
            wanted = str(expected, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001380
Victor Stinnerf96418d2015-09-21 23:06:27 +02001381
class UnicodeInternalTest(unittest.TestCase):
    # Checks for the deprecated "unicode_internal" codec.

    # Warning filter handed to support.check_warnings() by several tests.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        swap = sys.byteorder == "little"
        decodable = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        undecodable = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in decodable:
            # The literals above are byte-reversed on little-endian hosts
            # before being handed to the codec.
            data = raw[::-1] if swap else raw
            with support.check_warnings():
                self.assertEqual(text, data.decode("unicode_internal"))
        for raw in undecodable:
            data = raw[::-1] if swap else raw
            with support.check_warnings(self._DEPRECATED):
                with self.assertRaises(UnicodeDecodeError):
                    data.decode("unicode_internal")

        # One invalid 4-byte unit, spelled for the host byte order, together
        # with the text the "backslashreplace" handler must produce for it.
        if swap:
            bad, escaped = b"\x00\x00\x11\x00", r"\x00\x00\x11\x00"
        else:
            bad, escaped = b"\x00\x11\x00\x00", r"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                bad.decode("unicode_internal")
        with support.check_warnings():
            replaced = bad.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            slashed = bad.decode("unicode_internal", "backslashreplace")
            self.assertEqual(slashed, escaped)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The UnicodeDecodeError must name the codec, carry the input and
        # point at bytes 4..8 of it.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(self._DEPRECATED):
                data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # With an "ignore" style handler registered under a custom name, the
        # invalid unit placed between "a" and "b" is dropped while all 12
        # input bytes are reported as consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            self.assertEqual(encode("a")[1], 1)
            self.assertEqual(encode("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001462
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# consumed in order by NameprepTest.test_nameprep:
#   * expected is None  -> nameprep() must raise UnicodeError for the input;
#   * (None, None)      -> placeholder for a vector that is skipped; it keeps
#                          the list position in step with the "3.N" numbers.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1615
1616
class NameprepTest(unittest.TestCase):
    # Runs the nameprep test vectors listed in nameprep_tests.

    def test_nameprep(self):
        # Each vector runs in its own subTest labelled with its "3.N"
        # number.  A failing vector (whether a wrong result or a missing
        # UnicodeError) is therefore reported as an ordinary, labelled test
        # failure and the remaining vectors still run.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001635
Victor Stinnerf96418d2015-09-21 23:06:27 +02001636
class IDNACodecTest(unittest.TestCase):
    # One-shot, stream and incremental use of the "idna" codec.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once all three bytes have been read, a further read() returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The four cases mirror
        # test_builtin_decode: ASCII and non-ASCII names, each with and
        # without a trailing dot.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been seen;
        # a last label without a dot is flushed by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # The four cases mirror test_builtin_encode: ASCII and non-ASCII
        # names, each with and without a trailing dot.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # Same buffering rule as for decoding: output appears label by
        # label, and an undotted last label only on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1722
Victor Stinnerf96418d2015-09-21 23:06:27 +02001723
class CodecsModuleTest(unittest.TestCase):
    # Module-level functions of codecs: argument checking, lookups and the
    # exported names.

    def _check_lookup_errors(self, func):
        # Shared by the lookup-style tests: calling without arguments is a
        # TypeError, an unknown codec name a LookupError.
        with self.assertRaises(TypeError):
            func()
        with self.assertRaises(LookupError):
            func("__spam__")

    def test_decode(self):
        umlauts = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'), umlauts)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, umlauts)
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        umlauts = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), umlauts)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, umlauts)
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        self._check_lookup_errors(codecs.lookup)
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        self._check_lookup_errors(codecs.getencoder)

    def test_getdecoder(self):
        self._check_lookup_errors(codecs.getdecoder)

    def test_getreader(self):
        self._check_lookup_errors(codecs.getreader)

    def test_getwriter(self):
        self._check_lookup_errors(codecs.getwriter)

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must list exactly these names, and each of them
        # must exist as an attribute of the module.
        expected_names = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_names, codecs.__all__)
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        # Whatever the mode, codecs.open() hands back a StreamReaderWriter.
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec raises UnicodeError for any input, including
        # empty input, and with every error handler tried here.
        for text, raw in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(raw, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1831
Victor Stinnerf96418d2015-09-21 23:06:27 +02001832
class StreamReaderTest(unittest.TestCase):
    # Line-oriented reading through a UTF-8 StreamReader.

    def setUp(self):
        # U+D55C, a newline and U+AE00, encoded as UTF-8.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() splits after the newline and keeps it on the first
        # line.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001842
Victor Stinnerf96418d2015-09-21 23:06:27 +02001843
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile recoding in both directions.

    def test_basic(self):
        # Reading: the UTF-8 content of the underlying file comes back
        # recoded as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data written to the wrapper lands in the
        # underlying file as Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001855
# Names of the text encodings that BasicUnicodeTest.test_basics iterates
# over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" are only added when this build of the codecs module
# exposes the corresponding encoder function.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001965
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001966# The following encoding is not tested, because it's not supposed
1967# to work:
1968# "undefined"
1969
1970# The following encodings don't work in stateful mode
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001971broken_unicode_with_stateful = [
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001972 "punycode",
1973 "unicode_internal"
1974]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001975
Victor Stinnerf96418d2015-09-21 23:06:27 +02001976
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec in ``all_unicode_encodings``."""

    def _check_incremental(self, encoding, s, make_encoder, make_decoder):
        """Round-trip *s* piecewise through incremental encoder/decoder objects.

        *make_encoder* and *make_decoder* are called as ``f(encoding)`` and,
        for the errors-argument variant, ``f(encoding, "ignore")``.

        Return True if ``make_encoder(encoding)`` succeeded, False if it
        raised LookupError (the codec has no IncrementalEncoder).
        """
        msg = "encoding=%r" % encoding

        # Only the factory call is guarded, so a LookupError raised while
        # actually encoding/decoding is not silently swallowed.
        try:
            encoder = make_encoder(encoding)
        except LookupError:  # no IncrementalEncoder
            available = False
        else:
            available = True
            encodedresult = b"".join(encoder.encode(c) for c in s)
            encodedresult += encoder.encode("", True)
            decoder = make_decoder(encoding)
            decodedresult = "".join(decoder.decode(bytes([c]))
                                    for c in encodedresult)
            decodedresult += decoder.decode(b"", True)
            self.assertEqual(decodedresult, s, msg)

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                encoder = make_encoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(encoder.encode(c) for c in s)
                decoder = make_decoder(encoding, "ignore")
                decodedresult = "".join(decoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s, msg)

        return available

    def test_basics(self):
        """Round-trip a short string through each codec's Python-level APIs."""
        s = "abc123"  # all codecs should be able to encode these

        def make_encoder(encoding, *args):
            return codecs.getincrementalencoder(encoding)(*args)

        def make_decoder(encoding, *args):
            return codecs.getincrementaldecoder(encoding)(*args)

        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                msg = "encoding=%r" % encoding

                # The name reported by the codec registry must match the
                # list entry once "_" and "-" are treated alike.
                name = codecs.lookup(encoding).name
                if encoding.endswith("_codec"):
                    name += "_codec"
                elif encoding == "latin_1":
                    name = "latin_1"
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

                with support.check_warnings():
                    # unicode-internal has been deprecated
                    (b, size) = codecs.getencoder(encoding)(s)
                    self.assertEqual(size, len(s), msg)
                    (chars, size) = codecs.getdecoder(encoding)(b)
                    self.assertEqual(chars, s, msg)

                if encoding in broken_unicode_with_stateful:
                    continue

                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, msg)

                # check incremental decoder/encoder; iterencode() and
                # iterdecode() are only checked when the codec provides an
                # incremental encoder.
                if self._check_incremental(encoding, s,
                                           make_encoder, make_decoder):
                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, msg)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", msg)

    @support.cpython_only
    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
    def test_basics_capi(self):
        """Same incremental round trip, with coders fetched via the C API."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                # check incremental decoder/encoder (fetched via the C API)
                self._check_incremental(
                    encoding, s,
                    _testcapi.codec_incrementalencoder,
                    _testcapi.codec_incrementaldecoder)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for _ in range(5):
                    # Test that calling seek resets the internal codec
                    # state and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                encoded = u.encode(encoding)
                self.check_state_handling_decode(encoding, u, encoded)
                self.check_state_handling_encode(encoding, u, encoded)
2137
Victor Stinnerf96418d2015-09-21 23:06:27 +02002138
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with str, int->str and int->int maps."""

    # Input used by every table-driven case.  The "undefined" mappings
    # passed to _check_undefined() map bytes 0 and 1 to "a" and "b" and give
    # byte 2 no usable entry.
    _DATA = b"\x00\x01\x02"

    # (error handler, text expected in place of the undecodable byte 2)
    _HANDLER_OUTPUT = (
        ("replace", "\ufffd"),
        ("backslashreplace", "\\x02"),
        ("ignore", ""),
    )

    def _check_decodes(self, mapping, expected):
        """Assert that strict decoding of _DATA via *mapping* gives *expected*."""
        with self.subTest(errors="strict", mapping=mapping):
            self.assertEqual(
                codecs.charmap_decode(self._DATA, "strict", mapping),
                (expected, len(self._DATA)))

    def _check_undefined(self, mapping):
        """Assert that *mapping* leaves byte 2 of _DATA undefined.

        Strict decoding must raise UnicodeDecodeError; each handler in
        _HANDLER_OUTPUT must give "ab" followed by its substitute text and
        report all input bytes as consumed.
        """
        with self.subTest(errors="strict", mapping=mapping):
            self.assertRaises(UnicodeDecodeError, codecs.charmap_decode,
                              self._DATA, "strict", mapping)
        for errors, tail in self._HANDLER_OUTPUT:
            with self.subTest(errors=errors, mapping=mapping):
                self.assertEqual(
                    codecs.charmap_decode(self._DATA, errors, mapping),
                    ("ab" + tail, len(self._DATA)))

    def _check_empty_map_ignored(self, mapping):
        """Assert that every byte value is dropped under "ignore" with an
        empty *mapping*."""
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", mapping),
            ("", len(allbytes)))

    def test_decode_with_string_map(self):
        self._check_decodes("abc", "abc")
        self._check_decodes("\U0010FFFFbc", "\U0010FFFFbc")

        # A map that is too short, and one whose entry is U+FFFE, must both
        # behave as "no mapping for this byte".
        for mapping in ("ab", "ab\ufffe"):
            self._check_undefined(mapping)

        self._check_empty_map_ignored("")

    def test_decode_with_int2str_map(self):
        self._check_decodes({0: 'a', 1: 'b', 2: 'c'}, "abc")
        self._check_decodes({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc")
        self._check_decodes({0: '\U0010FFFF', 1: 'b', 2: 'c'},
                            "\U0010FFFFbc")
        # An empty-string entry is a valid mapping that contributes nothing.
        self._check_decodes({0: 'a', 1: 'b', 2: ''}, "ab")

        undefined_maps = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},  # Issue #14850
        )
        for mapping in undefined_maps:
            self._check_undefined(mapping)

        self._check_empty_map_ignored({})

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self._check_decodes({0: a, 1: b, 2: c}, "abc")
        # Issue #15379
        self._check_decodes({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc")
        self._check_decodes({0: sys.maxunicode, 1: b, 2: c},
                            chr(sys.maxunicode) + "bc")

        # A code point beyond sys.maxunicode is rejected with TypeError,
        # not reported as a decoding error.
        self.assertRaises(TypeError,
                          codecs.charmap_decode, self._DATA, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        undefined_maps = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in undefined_maps:
            self._check_undefined(mapping)
2373
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002374
class WithStmtTest(unittest.TestCase):
    """Check that codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the block must close the wrapped stream.
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same expectation as test_encodedfile: the underlying stream is
        # closed once the with block exits.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002388
Victor Stinnerf96418d2015-09-21 23:06:27 +02002389
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level codec functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # These two are added only when the codecs module provides them.
        for optional in ("mbcs_decode", "oem_decode"):
            decoder = getattr(codecs, optional, None)
            if decoder is not None:
                decoders.append(decoder)
        for decoder in decoders:
            with self.subTest(decoder=decoder.__name__):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.  The same
        # expectations hold for both escape decoders.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            with self.subTest(decoder=decode.__name__):
                self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
                self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

                # \U00110000 is one past the last valid code point.
                self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
                self.assertEqual(decode(r"\U00110000", "replace"),
                                 ("\ufffd", 10))
                self.assertEqual(
                    decode(r"\U00110000", "backslashreplace"),
                    (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002429
Serhiy Storchakad6793772013-01-29 10:20:44 +02002430
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Codes 32..126 other than the backslash encode to themselves.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and codes 127..255 use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by any other letter is kept verbatim and
        # must emit a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.subTest(escape=b):
                    with self.assertWarns(DeprecationWarning):
                        check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.subTest(escape=b.upper()):
                    with self.assertWarns(DeprecationWarning):
                        check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of hex digits a complete escape needs).
        # Every shorter digit count must be reported as a truncated escape;
        # \U takes 8 digits, so 0..7 are all exercised.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"),
                                     ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2517
2518
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and its decoder."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the single matching byte.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte value decodes to the code point with the same number.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash is copied through unchanged unless followed by u/U.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # Only \u and \U are treated as escapes by this decoder.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of hex digits a complete escape needs).
        # Every shorter digit count must be reported as a truncated escape;
        # \U takes 8 digits, so 0..7 are all exercised.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"),
                                     ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2567
2568
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode()."""

    def test_escape_encode(self):
        # (input bytes, expected escaped bytes).  The second item of the
        # returned tuple equals the input length in every case below.
        cases = (
            (b'', b''),
            (b'foobar', b'foobar'),
            (b'spam\0eggs', b'spam\\x00eggs'),
            (b'a\'b', b"a\\'b"),
            (b'b\\c', b'b\\\\c'),
            (b'c\nd', b'c\\nd'),
            (b'd\re', b'd\\re'),
            (b'f\x7fg', b'f\\x7fg'),
        )
        for data, escaped in cases:
            with self.subTest(data=data):
                result = codecs.escape_encode(data)
                self.assertEqual(result, (escaped, len(data)))

        # str and bytearray arguments are rejected.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2587
2588
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        """Assert *raw* decodes to *text* and *text* encodes back to *raw*."""
        decoded = raw.decode(encoding, "surrogateescape")
        self.assertEqual(decoded, text)
        encoded = text.encode(encoding, "surrogateescape")
        self.assertEqual(encoded, raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2621
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002622
class BomTest(unittest.TestCase):
    """Check how files from codecs.open() handle the BOM around seek()."""

    def test_seek0(self):
        data = "1234567890"
        twice = data * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        scratch = support.TESTFN
        self.addCleanup(support.unlink, scratch)

        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(scratch, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)

            # Check that the BOM is written after a seek(0)
            with codecs.open(scratch, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(scratch, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(scratch, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(scratch, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002678
Victor Stinner3fed0872010-05-22 02:16:27 +00002679
# Names of the bytes-to-bytes codecs that are always available.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alternative spellings that must resolve to each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# zlib is optional; the name is bound to None when it is missing so that
# later code can test it for truthiness.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional too; nothing is registered (and no name is bound) when
# the import fails.
with contextlib.suppress(ImportError):
    import bz2
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002709
Victor Stinnerf96418d2015-09-21 23:06:27 +02002710
class TransformCodecTest(unittest.TestCase):
    """Tests for the transform codecs listed in bytes_transform_encodings."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                encoded, consumed = codecs.getencoder(encoding)(payload)
                self.assertEqual(consumed, len(payload))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(encoded)
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(encoded)
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(original, encoding)
                encoded_from_view = codecs.encode(memoryview(original),
                                                  encoding)
                self.assertEqual(encoded_from_view, encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  encoding)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name, canonical)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        quoted = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(quoted, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        decoded = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(decoded, unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2842
Nick Coghlan8b097b42013-11-13 23:49:21 +10002843
2844# The codec system tries to wrap exceptions in order to ensure the error
2845# mentions the operation being performed and the codec involved. We
2846# currently *only* want this to happen for relatively stateless
2847# exceptions, where the only significant information they contain is their
2848# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002849
2850# Use a local codec registry to avoid appearing to leak objects when
Martin Panter119e5022016-04-16 09:28:57 +00002851# registering multiple search functions
# Maps a codec name to the CodecInfo that the search function below hands
# back for it.
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    """Codec search function: return the registered test codec or None."""
    return _TEST_CODECS.get(codec_name, None)


# Returns None, not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Fallback no-op used when _codecs does not provide the hook."""
        return None
2864
2865
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec exceptions get wrapped with operation/codec details."""

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = "{!r}{}".format(self, id(self))
        normalized = encodings.normalize_encoding(unique_id)
        self.codec_name = normalized.lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        _TEST_CODECS.pop(name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Register *encode*/*decode* under this test's unique codec name."""
        name = self.codec_name
        _TEST_CODECS[name] = codecs.CodecInfo(encode, decode, name=name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect a chained *exc_type* whose text names *operation* and codec."""
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Assert all four encode/decode entry points wrap *obj_to_raise*."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            __slots__ = ()

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"

        class MyRuntimeError(RuntimeError):
            pass

        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Assert all four entry points let *obj_to_raise* through unchanged."""
        def raise_obj(*args, **kwds):
            raise obj_to_raise

        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass

        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)

        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        exc = RuntimeError(text)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        # The initial codec lookup should not be wrapped
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0

        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0

        self.set_codec(encode_to_str, decode_to_bytes)
        name = self.codec_name
        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")
        # Text model methods should complain
        encode_template = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, encode_template.format(name)):
            "str_input".encode(name)
        decode_template = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, decode_template.format(name)):
            b"bytes input".decode(name)
3025
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003026
Georg Brandl02524622010-12-02 18:06:51 +00003027
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An *expected* of None means the decode must raise UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = decoded[0], decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An *expected* of None means the encode must raise UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data, consumed = encoded[0], encoded[1]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode)
        self.check_decode(self.CP_UTF8, utf8_decode)
        self.check_encode(self.CP_UTF8, utf8_encode)

    def test_incremental(self):
        # final=False: each case lists the input and the expected
        # (decoded text, number of bytes consumed) pair.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')
Steve Dowerf5aba582016-09-06 19:42:27 -07003192
Victor Stinner3a50e702011-10-18 21:21:00 +02003193
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3231
3232
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3268
3269
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().

    Each sample is processed both by the Python codec named by ENCODING and
    by the _testcapi EncodeLocaleEx()/DecodeLocaleEx() helpers.  The two
    must either produce the same result or both reject the input.
    """
    # Codec used as the reference implementation for the comparisons.
    ENCODING = sys.getfilesystemencoding()
    # Text samples fed to the encoder (and, once encoded, to the decoder).
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    # Raw byte samples that are always fed to the decoder.
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    # Lone surrogates sample (kept as a public class attribute).
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* with _testcapi.EncodeLocaleEx() using *errors*.

        The second argument is passed as 0 -- presumably the "current
        locale" flag; confirm against the _testcapi implementation.
        """
        return _testcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Compare encode() with str.encode() for every entry of STRINGS.

        When the reference codec raises UnicodeEncodeError for a sample,
        encode() must raise RuntimeError for that same sample and the same
        *errors* handler; otherwise both results must be equal.
        """
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    # Re-run the failing sample itself with the handler under
                    # test, so that the subtest verifies what it reports.
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    # The offending character may be anywhere in the sample,
                    # so any numeric position is accepted.
                    self.assertRegex(
                        errmsg, r"^encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        """Run the encoder comparison with the 'strict' handler."""
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        """Run the encoder comparison with the 'surrogateescape' handler."""
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        """Run the encoder comparison with 'surrogatepass', if supported."""
        # Probe with an empty string: an encoder lacking the handler reports
        # it through ValueError('unsupported error handler').
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              "surrogatepass error handler")
            else:
                raise

        self.check_encode_strings("surrogatepass")

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* with _testcapi.DecodeLocaleEx() using *errors*.

        The second argument is passed as 0 -- presumably the "current
        locale" flag; confirm against the _testcapi implementation.
        """
        return _testcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Compare decode() with bytes.decode() for a set of byte strings.

        The inputs are BYTES_STRINGS plus every encodable entry of STRINGS.
        When the reference codec raises UnicodeDecodeError, decode() must
        raise RuntimeError; otherwise both results must be equal.
        """
        is_utf8 = (self.ENCODING == "utf-8")
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                # Sample not representable in ENCODING: nothing to decode.
                encoded = None

            if is_utf8:
                # Also decode the 'surrogatepass' form when it differs from
                # the form obtained above.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        """Run the decoder comparison with the 'strict' handler."""
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        """Run the decoder comparison with the 'surrogateescape' handler."""
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        """Run the decoder comparison with 'surrogatepass', if supported."""
        # Probe with empty bytes: a decoder lacking the handler reports it
        # through ValueError('unsupported error handler').
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              "surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")
3372
3373
if __name__ == "__main__":
    # Executed as a script: let unittest discover and run this module's tests.
    unittest.main()