blob: eb21a3915b938a96664ea68fb56f0e5545bcd673 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01008
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# ctypes is optional; without it the size of wchar_t cannot be determined.
try:
    import ctypes
except ImportError:
    ctypes = None

# Size in bytes of the C wchar_t type, or -1 when ctypes is unavailable.
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000018
def coding_checker(self, coder):
    """Build a checker for the codec function *coder*.

    The returned ``check(input, expect)`` callable asserts, through the
    ``assertEqual`` method of *self*, that ``coder(input)`` returns the
    pair ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
23
Victor Stinnerf96418d2015-09-21 23:06:27 +020024
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type determines what
        # write() accepts and what read() returns.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return the first *size* items of the pending data.

        A negative *size* (the default) drains everything.
        """
        pending = self._buffer
        if size < 0:
            # slicing with [:0] yields an empty buffer of the same type
            taken, remaining = pending, pending[:0]
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
44
Victor Stinnerf96418d2015-09-21 23:06:27 +020045
class MixInCheckStateHandling:
    """Mix-in with getstate()/setstate() checks for incremental codecs.

    The methods rely on the ``assert*`` methods of the test case class
    they are mixed into.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be resumed from a saved state.

        For every split position in the encoded input *s*, the first part
        is decoded, the decoder state is transferred to a brand new
        decoder, and the rest is decoded with it; the two results joined
        must equal *u*.
        """
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # put the decoder into the default state, nothing buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Continue with a new decoder primed with the extracted state
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be resumed from a saved state.

        For every split position in *u*, the first part is encoded, the
        encoder state is transferred to a brand new encoder, and the rest
        is encoded with it; the two results joined must equal *s*.
        """
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
78
Victor Stinnerf96418d2015-09-21 23:06:27 +020079
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    Subclasses set ``encoding``; test_lone_surrogates() additionally reads
    ``ill_formed_sequence`` from the subclass.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and compare the output.

        After the i-th byte has been fed, everything decoded so far must
        equal ``partialresults[i]``.  The comparison is done with a
        StreamReader, with an incremental decoder (fresh, and once more
        after ``reset()``), and finally ``codecs.iterdecode()`` must give
        back *input* from the one-byte chunks.
        """
        # StreamReader: push single bytes into the queue and read back
        # everything that can be decoded so far.
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        encoded = input.encode(self.encoding)
        decoded = ""
        for byte, expected in zip(encoded, partialresults):
            queue.write(bytes([byte]))
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # Incremental decoder: the first pass uses the fresh decoder, the
        # second pass checks that the reset method works properly.
        decoder = codecs.getincrementaldecoder(self.encoding)()
        for needs_reset in (False, True):
            if needs_reset:
                decoder.reset()
            decoded = ""
            for byte, expected in zip(encoded, partialresults):
                decoded += decoder.decode(bytes([byte]))
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        chunks = [bytes([byte]) for byte in encoded]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # call readline() until it returns an empty string
            reader = getreader(input)
            nextline = lambda: reader.readline(size=size, keepends=keepends)
            return "|".join(iter(nextline, ""))

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        for size in (None, 10):
            for keepends, expected in ((True, sexpected),
                                       (False, sexpectednoends)):
                self.assertEqual(readalllines(s, keepends, size), expected)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vwo = [(200 * count) * "\u3042"
               for count in range(1, len(lineends) + 1)]
        vw = [body + lineend for body, lineend in zip(vwo, lineends)]
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                for keepends in (True, False):
                    if keepends:
                        first, second = size*"a" + lineend, "xxx\n"
                    else:
                        first, second = size*"a", "xxx"
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            raw = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        first, rest = lines[0], lines[1:]

        # Issue #8260: Test readline() followed by read()
        reader = getreader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.read(), ''.join(rest))
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        reader = getreader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.read(1), lines[1][0])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[len(first) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.readline(), first)
        self.assertEqual(reader.readlines(), rest)
        self.assertEqual(reader.read(), '')

        # Test read(n) followed by read()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(1), data[5])
        self.assertEqual(reader.read(0), '')
        self.assertEqual(reader.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(), [first[5:]] + rest)
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        # Iterating over the reader must give back the lines unchanged.
        expected_lines = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        raw = io.BytesIO("".join(expected_lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        for index, line in enumerate(reader):
            self.assertEqual(line, expected_lines[index])

    def test_readlinequeue(self):
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        def expect(keepends, *results):
            # every readline() call must return the next expected result
            for result in results:
                self.assertEqual(reader.readline(keepends=keepends), result)

        # No lineends
        writer.write("foo\r")
        expect(False, "foo")
        writer.write("\nbar\r")
        expect(False, "", "bar")
        writer.write("baz")
        expect(False, "baz", "")

        # Lineends
        writer.write("foo\r")
        expect(True, "foo\r")
        writer.write("\nbar\r")
        expect(True, "\n", "bar\r")
        writer.write("baz")
        expect(True, "baz", "")
        writer.write("foo\r\n")
        expect(True, "foo\r\n")

    def test_bug1098990_a(self):
        lines = [
            "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n",
            "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n",
            "next line.\r\n",
        ]

        raw = io.BytesIO("".join(lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        for line in lines:
            self.assertEqual(reader.readline(), line)
        # end of input
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        lines = [
            "aaaaaaaaaaaaaaaaaaaaaaaa\r\n",
            "bbbbbbbbbbbbbbbbbbbbbbbb\r\n",
            "stillokay:bbbbxx\r\n",
            "broken!!!!badbad\r\n",
            "againokay.\r\n",
        ]

        raw = io.BytesIO("".join(lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        for line in lines:
            self.assertEqual(reader.readline(), line)
        # end of input
        self.assertEqual(reader.readline(), "")

    # what decoding ill_formed_sequence with errors="replace" yields
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode(self.encoding)
        # a lone surrogate run through the encode error handlers
        for errors, replacement in [("backslashreplace", "[\\udc80]"),
                                    ("namereplace", "[\\udc80]"),
                                    ("xmlcharrefreplace", "[&#56448;]"),
                                    ("ignore", "[]"),
                                    ("replace", "[?]")]:
            self.assertEqual("[\uDC80]".encode(self.encoding, errors),
                             replacement.encode(self.encoding))

        # sequential surrogate characters
        for errors, replacement in [("ignore", "[]"), ("replace", "[??]")]:
            self.assertEqual("[\uD800\uDC80]".encode(self.encoding, errors),
                             replacement.encode(self.encoding))

        # whatever encoding the empty string produces is stripped from
        # the encoded neighbours below and put back once in front
        bom = "".encode(self.encoding)
        ill_formed = self.ill_formed_sequence
        backslashreplace = ''.join('\\x%02x' % b for b in ill_formed)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = bom + before_sequence + ill_formed + after_sequence
            with self.assertRaises(UnicodeDecodeError):
                test_sequence.decode(self.encoding)
            self.assertEqual(
                test_string.encode(self.encoding, "surrogatepass"),
                test_sequence)
            self.assertEqual(
                test_sequence.decode(self.encoding, "surrogatepass"),
                test_string)
            self.assertEqual(
                test_sequence.decode(self.encoding, "ignore"),
                before + after)
            self.assertEqual(
                test_sequence.decode(self.encoding, "replace"),
                before + self.ill_formed_sequence_replace + after)
            self.assertEqual(
                test_sequence.decode(self.encoding, "backslashreplace"),
                before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200402
Victor Stinnerf96418d2015-09-21 23:06:27 +0200403
class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    # byte layout of the ill-formed sequence follows sys.byteorder
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # a single BOM followed by "spam" twice, in either byte order
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        stream = info.streamwriter(raw)
        for _ in range(2):
            stream.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        stream = info.streamreader(io.BytesIO(data))
        self.assertEqual(stream.read(), "spamspam")

    def test_badbom(self):
        for count in (4, 8):
            raw = io.BytesIO(count*b"\xff")
            stream = codecs.getreader(self.encoding)(raw)
            with self.assertRaises(UnicodeError):
                stream.read()

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # four BOM bytes (byteorder known after the fourth), then
            # a character is complete after every fourth byte
            [""] * 4 +
            [""] * 3 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        for errors, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
498
Victor Stinnerf96418d2015-09-21 23:06:27 +0200499
class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # a character is complete after every fourth byte
            [""] * 3 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
544
Victor Stinnerf96418d2015-09-21 23:06:27 +0200545
class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # a character is complete after every fourth byte
            [""] * 3 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
590
591
class UTF16Test(ReadTest, unittest.TestCase):
    encoding = "utf-16"
    # byte layout of the ill-formed sequence follows sys.byteorder
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # a single BOM followed by "spam" twice, in either byte order
    spamle = b'\xff\xfe' + 2 * b's\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff' + 2 * b'\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        stream = info.streamwriter(raw)
        for _ in range(2):
            stream.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        stream = info.streamreader(io.BytesIO(data))
        self.assertEqual(stream.read(), "spamspam")

    def test_badbom(self):
        for payload in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream = codecs.getreader(self.encoding)(io.BytesIO(payload))
            with self.assertRaises(UnicodeError):
                stream.read()

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # two BOM bytes (byteorder known after the second); BMP
            # characters complete after two bytes, the last one after four
            [""] * 2 +
            [""] +
            ["\x00"] * 2 +
            ["\x00\xff"] * 2 +
            ["\x00\xff\u0100"] * 2 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        for errors, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # only the open call is run under the warnings check
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000677
class UTF16LETest(ReadTest, unittest.TestCase):
    """Checks for the BOM-less little-endian UTF-16 codec."""

    encoding = "utf-16-le"
    # 0xDC80 (a lone low surrogate) in little-endian byte order.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # check_partial is inherited and not visible here; the list appears
        # to hold the text decoded after each successive input byte --
        # confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, text expected with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing row and lets the remaining rows run.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_le_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair whose
        # code units are stored low byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
721
class UTF16BETest(ReadTest, unittest.TestCase):
    """Checks for the BOM-less big-endian UTF-16 codec."""

    encoding = "utf-16-be"
    # 0xDC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial is inherited and not visible here; the list appears
        # to hold the text decoded after each successive input byte --
        # confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, text expected with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing row and lets the remaining rows run.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair whose
        # code units are stored high byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
765
class UTF8Test(ReadTest, unittest.TestCase):
    """UTF-8 checks added on top of the cases inherited from ReadTest."""

    encoding = "utf-8"
    # U+DC80 (lone low surrogate) written out as three UTF-8-style bytes.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Prefix expected in front of encoder output; empty for plain UTF-8.
    BOM = b''

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # One entry per step; a prefix repeats while the bytes of the next
        # character are still incomplete (check_partial itself is inherited
        # and not visible in this excerpt).
        stages = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, stages)

    def test_decoder_state(self):
        # Covers the boundaries between 1-, 2-, 3- and 4-byte sequences.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same two invalid bytes, seen through each error handler.
        data = b'[\x80\xff]'
        outcomes = (
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        )
        for error_handler, expected in outcomes:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # surrogateescape can only map U+DC80..U+DCFF back to bytes; the
        # error must point at the surrogates it cannot handle.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        err = caught.exception
        self.assertEqual(err.object[err.start:err.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        codec = self.encoding
        handler = "surrogatepass"

        # Encoding: surrogates pass through as 3-byte sequences.
        for text, payload in (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        ):
            self.assertEqual(text.encode(codec, handler), self.BOM + payload)

        # Decoding: the same byte sequences are accepted again.
        for payload, text in (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        ):
            self.assertEqual(payload.decode(codec, handler), text)

        self.assertTrue(codecs.lookup_error(handler))

        # Truncated surrogate sequences are still errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(codec, handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000841
Victor Stinnerf96418d2015-09-21 23:06:27 +0200842
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Checks for the cp65001 codec; skipped everywhere except win32."""

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means the call must
        # raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # subTest keeps one failing row from hiding the rows after it.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # (bytes, error handler, expected text); None means the call must
        # raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # subTest keeps one failing row from hiding the rows after it.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Each lenient handler substitutes the surrogate in its own way.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # surrogatepass round-trips a lone surrogate as a 3-byte sequence.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
930
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200931
class UTF7Test(ReadTest, unittest.TestCase):
    """UTF-7 checks added on top of the cases inherited from ReadTest."""

    encoding = "utf-7"

    def test_ascii(self):
        codec = self.encoding

        def same_as_ascii(chars):
            # Characters that UTF-7 passes through unchanged must encode
            # and decode exactly like ASCII.
            self.assertEqual(chars.encode(codec), chars.encode('ascii'))
            self.assertEqual(chars.encode('ascii').decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        same_as_ascii(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        same_as_ascii(set_o)
        # + is the shift character and is written as "+-"
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        same_as_ascii(ws)
        # Other ASCII characters: everything below 0x80 not listed above,
        # in ascending code point order.
        direct = set_d + set_o + '+' + ws
        other_ascii = ''.join(ch for ch in map(chr, range(0x80))
                              if ch not in direct)
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # One entry per step; a prefix repeats while a base64 run is still
        # incomplete (check_partial itself is inherited and not visible in
        # this excerpt).
        stages = (
            ['a'] * 2
            + ['a+', 'a+-']
            + ['a+-b'] * 5
            + ['a+-b\x00']
            + ['a+-b\x00c'] * 5
            + ['a+-b\x00c\x80']
            + ['a+-b\x00c\x80d'] * 5
            + ['a+-b\x00c\x80d\u0100']
            + ['a+-b\x00c\x80d\u0100e'] * 8
            + ['a+-b\x00c\x80d\u0100e\U00010000']
            + ['a+-b\x00c\x80d\u0100e\U00010000f']
        )
        self.check_partial(text, stages)

    def test_errors(self):
        # (malformed input, text expected with errors='replace')
        cases = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        codec = self.encoding
        # The astral character and its explicit surrogate pair encode alike.
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(codec), b'+2AHcoA-')
        # The closing '-' is optional when decoding.
        for wire in (b'+2AHcoA-', b'+2AHcoA'):
            self.assertEqual(wire.decode(codec), '\U000104A0')
        # Same checks with one and two BMP characters in front, which
        # shifts the alignment inside the base64 run.
        for text, wire in (
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        ):
            self.assertEqual(text.encode(codec), wire)
            self.assertEqual(wire.decode(codec), text)
            self.assertEqual(wire[:-1].decode(codec), text)

    def test_lone_surrogates(self):
        # (input, text expected with errors='replace')
        cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001063
1064
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A single stray byte with final=True is rejected in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the required data argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1072
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode."""

    def test_array(self):
        import array
        # Any buffer-exporting object is accepted, not just bytes.
        source = array.array("b", b"spam")
        outcome = codecs.readbuffer_encode(source)
        self.assertEqual(outcome, (b"spam", 4))

    def test_empty(self):
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an argument with no buffer interface.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1088
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Re-runs the UTF-8 checks against utf-8-sig and adds BOM cases."""

    encoding = "utf-8-sig"
    # The inherited encode checks prepend this to their expected output.
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder strips the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader once per size
        # hint and check that the collected chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest reports which size hint failed and runs the others.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1173
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode (bytes backslash escapes)."""

    def test_empty(self):
        for blank in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(blank), (b"", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        # Every byte except the backslash itself is copied through as-is.
        for code in range(256):
            if code != ord('\\'):
                sample = bytes([code]) + b'0'
                self.assertEqual(decode(sample), (sample, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)

        # Recognised escapes: (escaped input, decoded bytes).
        recognised = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        ]
        for raw, expected in recognised:
            check(raw, expected)

        # Unrecognised escapes are kept verbatim but must warn.
        for code in range(ord('a'), ord('z') + 1):
            lower = bytes([code])
            if lower not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, b"\\" + lower)
            upper = lower.upper()
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + upper, b"\\" + upper)
        for raw in (br"\8", br"\9", b"\\\xfa"):
            with self.assertWarns(DeprecationWarning):
                check(raw, raw)

    def test_errors(self):
        decode = codecs.escape_decode
        # \x with zero or one hex digit is truncated: strict mode raises,
        # 'ignore' drops it and 'replace' substitutes '?'.
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            combined = bracketed + truncated
            self.assertRaises(ValueError, decode, truncated)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(combined, "ignore"),
                             (b"[]", len(combined)))
            self.assertEqual(decode(combined, "replace"),
                             (b"[?]?", len(combined)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001233
Victor Stinnerf96418d2015-09-21 23:06:27 +02001234
class RecodingTest(unittest.TestCase):
    """Regression check for closing a recoding stream wrapper."""

    def test_recoding(self):
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also close the wrapped stream.
        self.assertTrue(backing.closed)
1245
Martin v. Löwis2548c732003-04-18 10:39:54 +00001246# From RFC 3492
1247punycode_testcases = [
1248 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001249 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1250 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001251 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001252 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001253 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001254 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001255 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001256 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001257 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001258 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001259 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1260 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1261 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001262 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001263 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001264 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1265 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1266 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001267 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001269 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001270 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1271 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1272 "\u0939\u0948\u0902",
1273 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274
1275 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001276 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001277 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1278 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001279
1280 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001281 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1282 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1283 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001284 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1285 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001286
1287 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001288 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1289 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1290 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1291 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001292 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293
1294 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001295 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1296 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1297 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1298 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1299 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001300 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301
1302 # (K) Vietnamese:
1303 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1304 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001305 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1306 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1307 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1308 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001309 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001312 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001313 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001314
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001316 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1317 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1318 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001319 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320
1321 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001322 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1323 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1324 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001325 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326
1327 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001328 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001329 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001330
1331 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001332 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1333 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001334 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335
1336 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001337 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001338 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339
1340 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001341 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001342 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001343
1344 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001345 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1346 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001347 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001348 ]
1349
# Import-time sanity check of the table above: every entry must be a
# 2-tuple, so print any entry of a different length to make it easy to spot.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353
Victor Stinnerf96418d2015-09-21 23:06:27 +02001354
class PunycodeTest(unittest.TestCase):
    """Run the ``punycode_testcases`` vectors through the punycode codec."""

    def test_encode(self):
        """Encoding each Unicode sample yields the expected punycode."""
        for uni, puny in punycode_testcases:
            # Compare case-insensitively on both sides: some of the
            # extended encodings in the table use upper case while our
            # encoder only emits lower case, and lowering just ``puny``
            # is not enough because some input characters are themselves
            # upper case.
            produced = str(uni.encode("punycode"), "ascii").lower()
            wanted = str(puny, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        """Decoding each punycode sample yields the original text."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Same check again on a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001373
Victor Stinnerf96418d2015-09-21 23:06:27 +02001374
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated ``unicode_internal`` codec.

    Calls into the codec are wrapped in ``support.check_warnings`` because
    using it emits a DeprecationWarning.
    """

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Out-of-range or truncated input is rejected when decoding."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples above are written big-endian; flip them on
        # little-endian machines before handing them to the codec.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the offending slice."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises fails the test by itself when nothing is raised, so
        # no explicit try/except/else/self.fail() scaffolding is needed.
        with self.assertRaises(UnicodeDecodeError) as cm:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is honoured by the decoder."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four invalid bytes between the encoded "a" and "b";
            # the "ignore"-style handler must drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items consumed."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001455
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (orig, prepped) pair of UTF-8 encoded byte strings,
# consumed by NameprepTest.test_nameprep:
#   * prepped is None   -> nameprep(orig) is expected to raise UnicodeError
#   * orig is None      -> the vector is skipped (kept as a placeholder so
#                          the position still matches the "3.N" numbering)
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1608
1609
class NameprepTest(unittest.TestCase):
    """Check ``encodings.idna.nameprep`` against the ``nameprep_tests`` table."""

    def test_nameprep(self):
        """Run every non-skipped vector as its own sub-test.

        Each vector is reported under its "Test 3.N" label (N is the
        1-based position in ``nameprep_tests``), for both the
        expected-output and the expected-error cases, and a failing
        vector no longer hides the ones after it.
        """
        from encodings.idna import nameprep
        for number, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            with self.subTest("Test 3.%d" % number):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001628
Victor Stinnerf96418d2015-09-21 23:06:27 +02001629
class IDNACodecTest(unittest.TestCase):
    """Tests for the ``idna`` codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes ASCII and ACE ("xn--") labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") produces ASCII and ACE ("xn--") labels."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader returns ''."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Byte-at-a-time and chunked decoding give the full result."""
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                # Feed the decoder one byte per chunk.
                chunks = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is only emitted once the
        # decoder is told the input is complete (final=True).
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Character-at-a-time and chunked encoding give the full result."""
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        # Without a trailing dot the last label is only emitted once the
        # encoder is told the input is complete (final=True).
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                self.assertRaises(Exception,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(Exception,
                                  b"python.org".decode, "idna", errors)
1715
Victor Stinnerf96418d2015-09-21 23:06:27 +02001716
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001717class CodecsModuleTest(unittest.TestCase):
1718
1719 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001720 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1721 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001722 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001723 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001724 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001725
Victor Stinnera57dfd02014-05-14 17:13:14 +02001726 # test keywords
1727 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1728 '\xe4\xf6\xfc')
1729 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1730 '[]')
1731
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001732 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001733 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1734 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001735 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001736 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001737 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001738 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001739
Victor Stinnera57dfd02014-05-14 17:13:14 +02001740 # test keywords
1741 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1742 b'\xe4\xf6\xfc')
1743 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1744 b'[]')
1745
Walter Dörwald063e1e82004-10-28 13:04:26 +00001746 def test_register(self):
1747 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001748 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001749
1750 def test_lookup(self):
1751 self.assertRaises(TypeError, codecs.lookup)
1752 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001753 self.assertRaises(LookupError, codecs.lookup, " ")
1754
1755 def test_getencoder(self):
1756 self.assertRaises(TypeError, codecs.getencoder)
1757 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1758
1759 def test_getdecoder(self):
1760 self.assertRaises(TypeError, codecs.getdecoder)
1761 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1762
1763 def test_getreader(self):
1764 self.assertRaises(TypeError, codecs.getreader)
1765 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1766
1767 def test_getwriter(self):
1768 self.assertRaises(TypeError, codecs.getwriter)
1769 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001770
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001771 def test_lookup_issue1813(self):
1772 # Issue #1813: under Turkish locales, lookup of some codecs failed
1773 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001774 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001775 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1776 try:
1777 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1778 except locale.Error:
1779 # Unsupported locale on this system
1780 self.skipTest('test needs Turkish locale')
1781 c = codecs.lookup('ASCII')
1782 self.assertEqual(c.name, 'ascii')
1783
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001784 def test_all(self):
1785 api = (
1786 "encode", "decode",
1787 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1788 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1789 "getencoder", "getdecoder", "getincrementalencoder",
1790 "getincrementaldecoder", "getreader", "getwriter",
1791 "register_error", "lookup_error",
1792 "strict_errors", "replace_errors", "ignore_errors",
1793 "xmlcharrefreplace_errors", "backslashreplace_errors",
1794 "namereplace_errors",
1795 "open", "EncodedFile",
1796 "iterencode", "iterdecode",
1797 "BOM", "BOM_BE", "BOM_LE",
1798 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1799 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1800 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1801 "StreamReaderWriter", "StreamRecoder",
1802 )
1803 self.assertCountEqual(api, codecs.__all__)
1804 for api in codecs.__all__:
1805 getattr(codecs, api)
1806
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001807 def test_open(self):
1808 self.addCleanup(support.unlink, support.TESTFN)
1809 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1810 with self.subTest(mode), \
1811 codecs.open(support.TESTFN, mode, 'ascii') as file:
1812 self.assertIsInstance(file, codecs.StreamReaderWriter)
1813
1814 def test_undefined(self):
1815 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1816 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1817 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1818 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1819 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1820 self.assertRaises(UnicodeError,
1821 codecs.encode, 'abc', 'undefined', errors)
1822 self.assertRaises(UnicodeError,
1823 codecs.decode, b'abc', 'undefined', errors)
1824
Victor Stinnerf96418d2015-09-21 23:06:27 +02001825
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 StreamReader obtained via codecs.getreader()."""

    def setUp(self):
        # Two lines of UTF-8 text; only the first ends with a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps line endings."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001835
Victor Stinnerf96418d2015-09-21 23:06:27 +02001836
class EncodedFileTest(unittest.TestCase):
    """Tests for the transcoding wrapper returned by codecs.EncodedFile()."""

    def test_basic(self):
        """Reads and writes through the wrapper are transcoded."""
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        wrapper = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(wrapper.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data ends up in the file as Latin-1.
        sink = io.BytesIO()
        wrapper = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        wrapper.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001848
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001849all_unicode_encodings = [
1850 "ascii",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001851 "big5",
1852 "big5hkscs",
1853 "charmap",
1854 "cp037",
1855 "cp1006",
1856 "cp1026",
Serhiy Storchakabe0c3252013-11-23 18:52:23 +02001857 "cp1125",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001858 "cp1140",
1859 "cp1250",
1860 "cp1251",
1861 "cp1252",
1862 "cp1253",
1863 "cp1254",
1864 "cp1255",
1865 "cp1256",
1866 "cp1257",
1867 "cp1258",
1868 "cp424",
1869 "cp437",
1870 "cp500",
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001871 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001872 "cp737",
1873 "cp775",
1874 "cp850",
1875 "cp852",
1876 "cp855",
1877 "cp856",
1878 "cp857",
Benjamin Peterson5a6214a2010-06-27 22:41:29 +00001879 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001880 "cp860",
1881 "cp861",
1882 "cp862",
1883 "cp863",
1884 "cp864",
1885 "cp865",
1886 "cp866",
1887 "cp869",
1888 "cp874",
1889 "cp875",
1890 "cp932",
1891 "cp949",
1892 "cp950",
1893 "euc_jis_2004",
1894 "euc_jisx0213",
1895 "euc_jp",
1896 "euc_kr",
1897 "gb18030",
1898 "gb2312",
1899 "gbk",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001900 "hp_roman8",
1901 "hz",
1902 "idna",
1903 "iso2022_jp",
1904 "iso2022_jp_1",
1905 "iso2022_jp_2",
1906 "iso2022_jp_2004",
1907 "iso2022_jp_3",
1908 "iso2022_jp_ext",
1909 "iso2022_kr",
1910 "iso8859_1",
1911 "iso8859_10",
1912 "iso8859_11",
1913 "iso8859_13",
1914 "iso8859_14",
1915 "iso8859_15",
1916 "iso8859_16",
1917 "iso8859_2",
1918 "iso8859_3",
1919 "iso8859_4",
1920 "iso8859_5",
1921 "iso8859_6",
1922 "iso8859_7",
1923 "iso8859_8",
1924 "iso8859_9",
1925 "johab",
1926 "koi8_r",
Serhiy Storchakaf0eeedf2015-05-12 23:24:19 +03001927 "koi8_t",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001928 "koi8_u",
Serhiy Storchakaad8a1c32015-05-12 23:16:55 +03001929 "kz1048",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001930 "latin_1",
1931 "mac_cyrillic",
1932 "mac_greek",
1933 "mac_iceland",
1934 "mac_latin2",
1935 "mac_roman",
1936 "mac_turkish",
1937 "palmos",
1938 "ptcp154",
1939 "punycode",
1940 "raw_unicode_escape",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001941 "shift_jis",
1942 "shift_jis_2004",
1943 "shift_jisx0213",
1944 "tis_620",
1945 "unicode_escape",
1946 "unicode_internal",
1947 "utf_16",
1948 "utf_16_be",
1949 "utf_16_le",
1950 "utf_7",
1951 "utf_8",
1952]
1953
# The "mbcs" and "oem" codec functions are not present in every build of the
# codecs module, so each name is only added when its encoder exists.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"
1962
# The following encodings don't work in stateful mode (stream
# reader/writer and incremental encoder/decoder), so the tests below skip
# those checks for them.
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001968
Victor Stinnerf96418d2015-09-21 23:06:27 +02001969
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec named in all_unicode_encodings.

    Each encoding is exercised inside its own subTest so that a failing codec
    is reported by name and does not hide failures in the remaining ones.
    """

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                self._check_lookup_name(encoding)
                self._check_stateless_roundtrip(encoding, s)
                if encoding not in broken_unicode_with_stateful:
                    self._check_stream_roundtrip(encoding, s)
                    self._check_incremental_roundtrip(encoding, s)

    def _check_lookup_name(self, encoding):
        # The name reported by codecs.lookup() must match the name used to
        # look the codec up, ignoring the "_" vs "-" spelling.
        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

    def _check_stateless_roundtrip(self, encoding, s):
        # Encode and decode s in one shot with getencoder()/getdecoder().
        msg = "encoding=%r" % encoding
        with support.check_warnings():
            # unicode-internal has been deprecated
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), msg)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, msg)

    def _check_stream_roundtrip(self, encoding, s):
        # check stream reader/writer, feeding one character / one byte
        # at a time through a Queue
        msg = "encoding=%r" % encoding
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            self.assertIs(type(chunk), bytes)
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s, msg)

    def _check_incremental_roundtrip(self, encoding, s):
        # check incremental decoder/encoder and iterencode()/iterdecode()
        msg = "encoding=%r" % encoding
        try:
            encoder = codecs.getincrementalencoder(encoding)()
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check incremental decoder/encoder
            encodedresult = b"".join(encoder.encode(c) for c in s)
            encodedresult += encoder.encode("", True)
            decoder = codecs.getincrementaldecoder(encoding)()
            decodedresult = "".join(decoder.decode(bytes([c]))
                                    for c in encodedresult)
            decodedresult += decoder.decode(b"", True)
            self.assertEqual(decodedresult, s, msg)

            # check iterencode()/iterdecode()
            result = "".join(codecs.iterdecode(
                codecs.iterencode(s, encoding), encoding))
            self.assertEqual(result, s, msg)

            # check iterencode()/iterdecode() with empty string
            result = "".join(codecs.iterdecode(
                codecs.iterencode("", encoding), encoding))
            self.assertEqual(result, "")

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(encoder.encode(c) for c in s)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decodedresult = "".join(decoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                msg = "encoding=%r" % encoding
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for _ in range(5):
                    # Test that calling seek resets the internal codec
                    # state and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The self-comparison below is deliberate: it merely uses the type.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2131
Victor Stinnerf96418d2015-09-21 23:06:27 +02002132
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with str, int->str and int->int
    decoding maps under the different error handlers."""

    # Input decoded by almost every case.  The "incomplete" maps used below
    # define bytes 0x00 and 0x01 but leave byte 0x02 undefined (entry missing,
    # mapped to None, or mapped to U+FFFE).
    _DATA = b"\x00\x01\x02"

    # Expected text, per error handler, when only byte 0x02 is undefined.
    _UNDEFINED_LAST_BYTE = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _check_complete(self, cases):
        # Each (mapping, expected) pair must decode _DATA in strict mode.
        for mapping, expected in cases:
            with self.subTest(mapping=mapping):
                self.assertEqual(
                    codecs.charmap_decode(self._DATA, "strict", mapping),
                    (expected, len(self._DATA))
                )

    def _check_incomplete(self, mappings):
        # Each mapping leaves byte 0x02 undefined: strict decoding must fail
        # and the other handlers must produce their usual substitution.
        for mapping in mappings:
            with self.subTest(mapping=mapping, errors="strict"):
                self.assertRaises(UnicodeDecodeError,
                    codecs.charmap_decode, self._DATA, "strict", mapping
                )
            for errors, expected in self._UNDEFINED_LAST_BYTE:
                with self.subTest(mapping=mapping, errors=errors):
                    self.assertEqual(
                        codecs.charmap_decode(self._DATA, errors, mapping),
                        (expected, len(self._DATA))
                    )

    def _check_all_bytes_ignored(self, empty_mapping):
        # With an empty map and "ignore", every byte is consumed and dropped.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_mapping),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        self._check_complete([
            ("abc", "abc"),
            ("\U0010FFFFbc", "\U0010FFFFbc"),
        ])
        self._check_incomplete([
            "ab",
            "ab\ufffe",
        ])
        self._check_all_bytes_ignored("")

    def test_decode_with_int2str_map(self):
        self._check_complete([
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ])
        self._check_incomplete([
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            # Issue #14850
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ])
        self._check_all_bytes_ignored({})

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self._check_complete([
            ({0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        ])

        # A code point above sys.maxunicode is a type error, not a decode
        # error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, self._DATA, "strict",
            {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self._check_incomplete([
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ])
2367
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002368
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers work as context managers and close the
    wrapped stream when the with-block exits."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same post-condition as test_encodedfile: leaving the block closes
        # the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002382
Victor Stinnerf96418d2015-09-21 23:06:27 +02002383
class TypesTest(unittest.TestCase):
    """Which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.subTest(decoder=decoder):
                self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        # Both escape decoders are expected to behave identically here.
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            with self.subTest(decode=decode):
                self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
                self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

                # \U00110000 is above the last valid code point.
                self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
                self.assertEqual(decode(r"\U00110000", "replace"),
                                 ("\ufffd", 10))
                self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                                 (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002423
Serhiy Storchakad6793772013-01-29 10:20:44 +02002424
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through as is.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                with self.subTest(byte=b):
                    self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte other than the backslash decodes to the character with
        # the same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                with self.subTest(byte=b):
                    self.assertEqual(decode(bytes([b]) + b'0'),
                                     (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 0x7f-0xff use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        for raw, expected in (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        ):
            with self.subTest(raw=raw):
                check(raw, expected)
        # A backslash followed by an ASCII letter outside the lists below is
        # kept verbatim and must emit a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of hex digits tried): every escape built
        # here has too few digits and must be rejected.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"),
                                     ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        # Code point above the last valid one.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2511
2512
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() /
    raw_unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Characters 0-255 encode to the single byte with the same ordinal.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            with self.subTest(byte=b):
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every single byte decodes to the character with the same ordinal.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            with self.subTest(byte=b):
                self.assertEqual(decode(bytes([b]) + b'0'),
                                 (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but "u"/"U" is left untouched.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash followed by anything but "u"/"U" is left untouched.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of hex digits tried): every escape built
        # here has too few digits and must be rejected.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                with self.subTest(escape=c, digits=i):
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"\\" + c + b"0"*i)
                    self.assertRaises(UnicodeDecodeError, decode,
                                      b"[\\" + c + b"0"*i + b"]")
                    data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                    self.assertEqual(decode(data, "ignore"),
                                     ("[]", len(data)))
                    self.assertEqual(decode(data, "replace"),
                                     ("[\ufffd]\ufffd", len(data)))
        # Code point above the last valid one.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2561
2562
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode()."""

    def test_escape_encode(self):
        # (input bytes, (escaped bytes, number of input bytes consumed))
        tests = [
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        ]
        for data, output in tests:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data), output)
        # Only bytes objects are accepted: str and bytearray are rejected.
        for bad in ('spam', bytearray(b'spam')):
            with self.subTest(data=bad):
                self.assertRaises(TypeError, codecs.escape_encode, bad)
2581
2582
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler.

    In every case below an undecodable byte 0xNN corresponds to the lone
    surrogate U+DCNN, and encoding turns that surrogate back into the byte.
    """

    def _check_roundtrip(self, encoding, raw, text):
        # raw must decode to text and text must encode back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual(
            "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
            b"\xe4\xeb\xef\xf6\xfc")
2615
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002616
class BomTest(unittest.TestCase):
    """Byte-order-mark handling of files opened with codecs.open()."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a
                # position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a
                # seek() at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002672
Victor Stinner3fed0872010-05-22 02:16:27 +00002673
# Bytes-to-bytes codecs exercised by the transform tests; the optional
# compression codecs are appended below when their modules import.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# Alternative names expected to resolve to each transform codec
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Kept as None so tests can use it as a skip condition
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002703
Victor Stinnerf96418d2015-09-21 23:06:27 +02002704
class TransformCodecTest(unittest.TestCase):
    """Tests for the transform codecs named in bytes_transform_encodings
    and transform_aliases."""

    def test_basics(self):
        all_bytes = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Round-trip every byte value through the generic codecs
                # interface and check the reported consumed lengths
                encoded, consumed = codecs.getencoder(encoding)(all_bytes)
                self.assertEqual(consumed, len(all_bytes))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, all_bytes)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                stream = io.BytesIO(codecs.encode(b"\x80", encoding))
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # Every transform codec must accept memoryview input for both
        # encoding and decoding, and must round-trip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded = codecs.encode(original, encoding)
                encoded_from_view = codecs.encode(memoryview(original),
                                                  encoding)
                self.assertEqual(encoded_from_view, encoded)
                decoded = codecs.decode(encoded, encoding)
                self.assertEqual(decoded, original)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  encoding)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # str.encode must reject binary -> binary codecs with a helpful
        # LookupError that is not chained to another exception
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        text = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    text.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # str.encode must also reject str -> str codecs with a helpful
        # error message
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # bytes.decode and bytearray.decode must reject binary -> binary
        # codecs with a helpful error message
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        plain = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(plain, encoding)
                pattern = template.format(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, pattern):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # The decode methods of binary types must reject the str -> str
        # codec, without chaining the error to another exception
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for binary_value in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=binary_value):
                with self.assertRaisesRegex(LookupError, pattern) as failure:
                    binary_value.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Malformed input must yield an error naming the zlib codec,
        # chained to a cause of the same exception type
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Malformed input must yield an error naming the hex codec,
        # chained to a cause of the same exception type
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as failure:
            codecs.decode(b"hello", "hex_codec")
        wrapper = failure.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            canonical_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name,
                                     canonical_name)

    def test_quopri_stateless(self):
        # Encoding is expected to behave as with quotetabs=True
        quoted = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(quoted, b"space=20tab=09eol=20\n")
        # Decoding must still pass unescaped tabs and spaces through
        unescaped = b"space tab eol\n"
        roundtrip = codecs.decode(unescaped, "quopri-codec")
        self.assertEqual(roundtrip, unescaped)

    def test_uu_invalid(self):
        # Input without a "begin" line must be rejected
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2836
Nick Coghlan8b097b42013-11-13 23:49:21 +10002837
2838# The codec system tries to wrap exceptions in order to ensure the error
2839# mentions the operation being performed and the codec involved. We
2840# currently *only* want this to happen for relatively stateless
2841# exceptions, where the only significant information they contain is their
2842# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002843
# One module-level registry behind a single search function, so the tests
# never register multiple search functions (which would look like leaked
# objects).
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: return the registered entry, or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)
2851
def _forget_codec(codec_name):
    # No-op fallback, kept only when _codecs does not provide the helper
    pass

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    pass
2858
2859
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec exceptions get wrapped with operation/codec details."""

    def setUp(self):
        # A codec search function cannot be unregistered, so the shared
        # one is kept fairly harmless after the test case finishes by
        # deriving the codec name from the test case repr. The codecs
        # module normalizes codec names (this does not appear to be
        # formally documented), so the name is normalized here as well.
        # id(self) is appended to get a truly unique name and so avoid
        # issues with the codec cache when these tests run multiple
        # times (e.g. when hunting for refleaks).
        raw_name = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # The object to raise lives on the instance because codec caching
        # (the codec entry can't be recreated) interacts badly with
        # regrtest refleak hunting (the same test instance is run several
        # times). The codecs therefore call back in to the instance to
        # learn which exception to raise, rather than binding it in a
        # closure to an object that may change on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(self.codec_name)

    def set_codec(self, encode, decode):
        # Publish the given functions under this test's unique codec name
        _TEST_CODECS[self.codec_name] = codecs.CodecInfo(
            encode, decode, name=self.codec_name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Expect exc_type with the wrapper message, chained to a cause of
        # the same type that still carries a traceback
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        text = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is registered under this name, so the initial lookup
        # fails; that LookupError should not be wrapped
        pattern = "^unknown encoding: {}$".format(self.codec_name)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods.
        # Third party codecs won't be flagged, so the case where an
        # inappropriate output type is produced must still be handled
        # appropriately.
        def str_producing_encoder(*args, **kwds):
            return "not bytes!", 0
        def bytes_producing_decoder(*args, **kwds):
            return b"not str!", 0
        self.set_codec(str_producing_encoder, bytes_producing_decoder)

        # No input or output type checks on the codecs module functions
        self.assertEqual(codecs.encode(None, self.codec_name), "not bytes!")
        self.assertEqual(codecs.decode(None, self.codec_name), b"not str!")

        # Text model methods should complain
        encode_pattern = (
            r"^{!r} encoder returned 'str' instead of 'bytes'; "
            r"use codecs.encode\(\) to encode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, encode_pattern):
            "str_input".encode(self.codec_name)

        decode_pattern = (
            r"^{!r} decoder returned 'bytes' instead of 'str'; "
            r"use codecs.decode\(\) to decode to arbitrary types$"
        ).format(self.codec_name)
        with self.assertRaisesRegex(TypeError, decode_pattern):
            b"bytes input".decode(self.codec_name)
3019
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003020
Georg Brandl02524622010-12-02 18:06:51 +00003021
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the name of the code page
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests yields (raw bytes, error handler, expected text); an
        # expected value of None means decoding must raise
        # UnicodeDecodeError
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            self.assertEqual(text, expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, text, expected))
            # The consumed length must satisfy 0 <= consumed <= len(raw)
            consumed = decoded[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # tests yields (text, error handler, expected bytes); an expected
        # value of None means encoding must raise UnicodeEncodeError
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw = encoded[0]
            self.assertEqual(raw, expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, raw, expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # final=False: each case pairs the input with the expected
        # (decoded text, consumed length) result
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        import _bootlocale
        fake_getpreferredencoding = lambda *a: 'cp123'
        saved = _bootlocale.getpreferredencoding
        _bootlocale.getpreferredencoding = fake_getpreferredencoding
        try:
            self.assertEqual(codecs.lookup('cp123').name, 'mbcs')
        finally:
            _bootlocale.getpreferredencoding = saved
3193
Victor Stinner3a50e702011-10-18 21:21:00 +02003194
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3232
3233
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                encoded = text.encode('latin1')
                self.assertEqual(encoded, wanted)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('latin1', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # (raw bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                decoded = raw.decode('latin1')
                self.assertEqual(decoded, wanted)
3269
3270
# Run the tests through unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()