blob: de6868a46c47467afc8cab86d520b7556166283e [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01008
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T is set to the sentinel -1; otherwise SIZEOF_WCHAR_T is
# the size in bytes of the C ``wchar_t`` type as reported by ctypes.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000018
def coding_checker(self, coder):
    """Build a checking helper for the codec function *coder*.

    *self* is the test case whose ``assertEqual`` is used.  The returned
    ``check(input, expect)`` asserts that ``coder(input)`` equals the tuple
    ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
23
Victor Stinnerf96418d2015-09-21 23:06:27 +020024
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The queue is backed by a single sequence (*buffer*); ``write()`` appends
    to it with ``+=`` and ``read()`` removes data from its front.
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer's type (e.g. b"" or "").
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        With a negative *size* (the default) everything buffered is returned
        and the queue is left empty; otherwise at most *size* items are
        returned and removed.
        """
        if size < 0:
            data = self._buffer
            # Slice instead of using a literal so the empty buffer keeps
            # the same type as the original one.
            self._buffer = self._buffer[:0]
            return data
        data = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return data
44
Victor Stinnerf96418d2015-09-21 23:06:27 +020045
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() of
    incremental codecs.

    The helpers use ``self.assert*`` methods, so the mixin has to be combined
    with a class that provides them (e.g. ``unittest.TestCase``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted at every offset.

        For each split point the state of a decoder that consumed the first
        part is transferred to a fresh decoder, which decodes the rest; the
        concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted at every offset.

        For each split point the state of an encoder that consumed the first
        part is transferred to a fresh encoder, which encodes the rest; the
        concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
78
Victor Stinnerf96418d2015-09-21 23:06:27 +020079
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test cases.

    This class reads ``self.encoding`` and ``self.ill_formed_sequence`` but
    does not define them, and it uses ``self.assert*`` methods, so concrete
    subclasses have to supply those (e.g. by also deriving from
    ``unittest.TestCase``).
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* byte by byte and compare the
        accumulated output after each byte with the matching entry of
        *partialresults*.

        The check is done with a StreamReader, with an incremental decoder,
        with the same decoder after ``reset()``, and finally the complete
        input is round-tripped through ``codecs.iterdecode()``.
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of *input*
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line and join the lines with "|" so the
            # split points are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            # fresh StreamReader over the encoded rhyme
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the input lines unchanged.
        # NOTE(review): this file was recovered from a web "blame" view that
        # collapsed whitespace; runs of spaces inside the literals below may
        # have been shortened -- verify them against the upstream test.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written through the
        # writer becomes available to the reader incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ``ill_formed_sequence`` when decoding with
    # the "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding the empty string yields only what the codec emits up
        # front; its length is used to strip that prefix from the pieces.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200388
Victor Stinnerf96418d2015-09-21 23:06:27 +0200389
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (BOM-prefixed, either byte order)."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in the platform byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" with a single leading BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
484
Victor Stinnerf96418d2015-09-21 23:06:27 +0200485
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Little-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
530
Victor Stinnerf96418d2015-09-21 23:06:27 +0200531
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Big-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
576
577
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (BOM-prefixed, either byte order)."""

    encoding = "utf-16"
    # Encoded form of the lone surrogate U+DC80 in the platform byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" with a single leading BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Victor Stinner2cca0572011-05-23 14:51:42 +0200662 self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000663
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Little-endian encoded form of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the "replace" handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
706 "\U00010203")
707
class UTF16BETest(ReadTest, unittest.TestCase):
    # Big-endian UTF-16 checks layered on the shared ReadTest cases.
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected result per input byte: the text decoded so far
        # after that byte has been fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected output with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining vectors running after a failure
            # and names the offending input in the report.
            with self.subTest(raw=raw, expected=expected):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character must round-trip through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
751
class UTF8Test(ReadTest, unittest.TestCase):
    # UTF-8 checks.  BOM is empty here; every expected encoder output is
    # written as ``self.BOM + payload`` so a subclass can supply its own.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    BOM = b''

    def test_partial(self):
        # One expected result per input byte: the text decoded so far
        # after that byte has been fed to the decoder.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same invalid input is decoded under each error handler.
        invalid = b'[\x80\xff]'
        outcomes = (
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        )
        for error_handler, expected in outcomes:
            with self.subTest(data=invalid, error_handler=error_handler,
                              expected=expected):
                decoded = invalid.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        failure = cm.exception
        # The reported error range must cover exactly the two surrogates
        # that the handler could not encode.
        offending = failure.object[failure.start:failure.end]
        self.assertEqual(offending, '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        enc = self.encoding

        # Encoding: surrogates pass through as three-byte sequences.
        encode_cases = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        )
        for text, payload in encode_cases:
            self.assertEqual(text.encode(enc, "surrogatepass"),
                             self.BOM + payload)

        # Decoding: the same byte sequences map back to the surrogates.
        decode_cases = (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        )
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(enc, "surrogatepass"), text)

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Incomplete surrogate sequences must still be rejected.
        for truncated in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                truncated.decode(enc, "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000827
Victor Stinnerf96418d2015-09-21 23:06:27 +0200828
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Checks for the cp65001 codec; skipped unless running on win32.
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means that encoding
        # must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # subTest lets every row run and be reported independently.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                        '%a.encode("cp65001", %r)=%a != %a'
                        % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                        text.encode, "cp65001", errors)

    def test_decode(self):
        # (bytes, error handler, expected text); None means that decoding
        # must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # subTest lets every row run and be reported independently.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                        '%a.decode("cp65001", %r)=%a != %a'
                        % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                        raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # With the default handler a lone surrogate is rejected in both
        # directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # The remaining handlers each substitute something for it.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # surrogatepass must round-trip lone surrogates in both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
915 self.assertTrue(codecs.lookup_error("surrogatepass"))
916
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200917
class UTF7Test(ReadTest, unittest.TestCase):
    # UTF-7 checks layered on the shared ReadTest cases.
    encoding = "utf-7"

    def test_ascii(self):
        enc = self.encoding

        def roundtrips_as_ascii(chars):
            # These characters must be encoded as themselves and decode
            # back unchanged.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(enc), as_ascii)
            self.assertEqual(as_ascii.decode(enc), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        roundtrips_as_ascii(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        roundtrips_as_ascii(set_o)
        # +
        self.assertEqual('a+b'.encode(enc), b'a+-b')
        self.assertEqual(b'a+-b'.decode(enc), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        roundtrips_as_ascii(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_checked))
        self.assertEqual(other_ascii.encode(enc),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected result per input byte: the text decoded so far
        # after that byte has been fed to the decoder.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (malformed input, expected output with errors='replace')
        vectors = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for data, replaced in vectors:
            with self.subTest(raw=data):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(data, 'strict', True)
                self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        enc = self.encoding
        # A surrogate pair encodes like the character it represents.
        self.assertEqual('\ud801\udca0'.encode(enc), b'+2AHcoA-')
        # (text, encoded form, same form without the closing '-')
        samples = (
            ('\U000104A0', b'+2AHcoA-', b'+2AHcoA'),
            ('\u20ac\U000104A0', b'+IKzYAdyg-', b'+IKzYAdyg'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-', b'+IKwgrNgB3KA'),
        )
        for text, terminated, unterminated in samples:
            self.assertEqual(text.encode(enc), terminated)
            self.assertEqual(terminated.decode(enc), text)
            self.assertEqual(unterminated.decode(enc), text)

    def test_lone_surrogates(self):
        # (input, expected output with errors='replace')
        vectors = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for data, replaced in vectors:
            with self.subTest(raw=data):
                self.assertEqual(data.decode('utf-7', 'replace'), replaced)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001049
1050
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte with final=True is a truncated code unit.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder with no arguments is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1058
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # An array of signed bytes is accepted and returned as bytes,
        # together with its length.
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        # An empty str yields empty bytes and a length of zero.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # No argument at all, or an int argument, is a TypeError.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
1074
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Re-runs the UTF8Test cases for "utf-8-sig", with BOM set to the
    # UTF-8 signature, and adds signature-specific checks.
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the signature the encoder
        # prepends.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader in chunks of
        # several sizes (None means one unbounded read() per call) and
        # check that the concatenated output equals *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # Report each chunk size separately instead of stopping at
            # the first one that fails.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the signature: it must not show up in
        # the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1159
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # Empty bytes and an empty bytearray both decode to (b"", 0).
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            data = bytes([code]) + b'0'
            self.assertEqual(codecs.escape_decode(data), (data, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        recognised = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", b"[\\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, decoded in recognised:
            check(escaped, decoded)

        # Unrecognised escapes are kept verbatim but must warn.  Lowercase
        # letters that are valid escapes are exempt; their uppercase
        # forms are not.
        for code in range(97, 123):
            lower = bytes([code])
            upper = lower.upper()
            if lower not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + lower, b"\\" + lower)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + upper, b"\\" + upper)
        for escaped, decoded in (
            (br"\8", b"\\8"),
            (br"\9", b"\\9"),
            (b"\\\xfa", b"\\\xfa"),
        ):
            with self.assertWarns(DeprecationWarning):
                check(escaped, decoded)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: strict raises, 'ignore' drops them and
        # 'replace' substitutes b"?".  The consumed length always covers
        # the whole input.
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            doubled = bracketed + truncated
            with self.assertRaises(ValueError):
                decode(truncated)
            with self.assertRaises(ValueError):
                decode(bracketed)
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001219
Victor Stinnerf96418d2015-09-21 23:06:27 +02001220
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = io.BytesIO()
        # The with-block closes the recoder, and through it the wrapped
        # stream, even if write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the recoder must have closed the underlying stream too.
        self.assertTrue(f.closed)
1231
Martin v. Löwis2548c732003-04-18 10:39:54 +00001232# From RFC 3492
1233punycode_testcases = [
1234 # A Arabic (Egyptian):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001235 ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
1236 "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001237 b"egbpdaj6bu4bxfgehfvwxn"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001238 # B Chinese (simplified):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001239 ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001240 b"ihqwcrb4cv8a8dqg056pqjye"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001241 # C Chinese (traditional):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001242 ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001243 b"ihqwctvzc91f659drss3x8bo0yb"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001244 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001245 ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
1246 "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
1247 "\u0065\u0073\u006B\u0079",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001248 b"Proprostnemluvesky-uyb24dma41a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001249 # E Hebrew:
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001250 ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
1251 "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
1252 "\u05D1\u05E8\u05D9\u05EA",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001253 b"4dbcagdahymbxekheh6e0a7fei0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # F Hindi (Devanagari):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001255 ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001256 "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
1257 "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
1258 "\u0939\u0948\u0902",
1259 b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001260
1261 #(G) Japanese (kanji and hiragana):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001262 ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
Walter Dörwalda4c61282007-05-10 12:36:25 +00001263 "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
1264 b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265
1266 # (H) Korean (Hangul syllables):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001267 ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
1268 "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
1269 "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001270 b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
1271 b"psd879ccm6fea98c"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001272
1273 # (I) Russian (Cyrillic):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001274 ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
1275 "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
1276 "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
1277 "\u0438",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001278 b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001279
1280 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001281 ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
1282 "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
1283 "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
1284 "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
1285 "\u0061\u00F1\u006F\u006C",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001286 b"PorqunopuedensimplementehablarenEspaol-fmd56a"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287
1288 # (K) Vietnamese:
1289 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
1290 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001291 ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
1292 "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
1293 "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
1294 "\u0056\u0069\u1EC7\u0074",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001295 b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001296
Martin v. Löwis2548c732003-04-18 10:39:54 +00001297 #(L) 3<nen>B<gumi><kinpachi><sensei>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001298 ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001299 b"3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +00001300
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301 # (M) <amuro><namie>-with-SUPER-MONKEYS
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001302 ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
1303 "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
1304 "\u004F\u004E\u004B\u0045\u0059\u0053",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001305 b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001306
1307 # (N) Hello-Another-Way-<sorezore><no><basho>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001308 ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1309 "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1310 "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001311 b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312
1313 # (O) <hitotsu><yane><no><shita>2
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001315 b"2-u9tlzr9756bt3uc0v"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316
1317 # (P) Maji<de>Koi<suru>5<byou><mae>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001318 ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1319 "\u308B\u0035\u79D2\u524D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001320 b"MajiKoi5-783gue6qz075azm5e"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321
1322 # (Q) <pafii>de<runba>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001323 ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001324 b"de-jg4avhby1noc0d"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325
1326 # (R) <sono><supiido><de>
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001327 ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001328 b"d9juau41awczczp"),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001329
1330 # (S) -> $1.00 <-
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001331 ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1332 "\u003C\u002D",
Walter Dörwalda4c61282007-05-10 12:36:25 +00001333 b"-> $1.00 <--")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334 ]
1335
# Import-time sanity check of the table above: every entry must be a
# (unicode, punycode) pair.  Malformed entries are printed, not raised.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339
Victor Stinnerf96418d2015-09-21 23:06:27 +02001340
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341class PunycodeTest(unittest.TestCase):
1342 def test_encode(self):
1343 for uni, puny in punycode_testcases:
1344 # Need to convert both strings to lower case, since
1345 # some of the extended encodings use upper case, but our
1346 # code produces only lower case. Converting just puny to
1347 # lower is also insufficient, since some of the input characters
1348 # are upper case.
Ezio Melottib3aedd42010-11-20 19:04:17 +00001349 self.assertEqual(
Walter Dörwalda4c61282007-05-10 12:36:25 +00001350 str(uni.encode("punycode"), "ascii").lower(),
1351 str(puny, "ascii").lower()
1352 )
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353
1354 def test_decode(self):
1355 for uni, puny in punycode_testcases:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001356 self.assertEqual(uni, puny.decode("punycode"))
Guido van Rossum04c70ad2007-08-29 14:04:40 +00001357 puny = puny.decode("ascii").encode("ascii")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001358 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001359
Victor Stinnerf96418d2015-09-21 23:06:27 +02001360
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"

        def native(data):
            # The literals in this test are written big-endian; reverse
            # them when running on a little-endian host.
            return bytes(reversed(data)) if little_endian else data

        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)

        decodable = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        undecodable = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]

        for raw, expected in decodable:
            raw = native(raw)
            with support.check_warnings():
                self.assertEqual(expected, raw.decode("unicode_internal"))

        for raw in undecodable:
            raw = native(raw)
            with support.check_warnings(deprecation):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")

        # One 4-byte unit holding 0x110000, just above the Unicode range.
        invalid = native(b"\x00\x11\x00\x00")
        # Expected "backslashreplace" output: every input byte as \xNN.
        invalid_backslashreplace = "".join(r"\x%02x" % byte
                                           for byte in invalid)
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the offending unit."""
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(deprecation):
                payload.decode("unicode_internal")
        error = caught.exception
        self.assertEqual("unicode_internal", error.encoding)
        self.assertEqual(payload, error.object)
        self.assertEqual(4, error.start)
        self.assertEqual(8, error.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error callback is used for an undecodable unit."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with support.check_warnings(deprecation):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four 0x22 bytes between the encoded "a" and "b".
            spliced = ab[:4] + "\x22\x22\x22\x22" + ab[4:]
            ignored = decoder(spliced.encode("ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input characters consumed."""
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with support.check_warnings(deprecation):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            consumed_one = encoder("a")[1]
            self.assertEqual(consumed_one, 1)
            consumed_two = encoder("\xe9\u0142")[1]
            self.assertEqual(consumed_two, 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001441
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# listed in the order of the numbered vectors 3.1 .. 3.45:
#   * (None, None) marks a vector that is skipped; it keeps the numbering
#     of the following entries aligned.
#   * expected is None when the input must be rejected with UnicodeError.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1594
1595
1596class NameprepTest(unittest.TestCase):
1597 def test_nameprep(self):
1598 from encodings.idna import nameprep
1599 for pos, (orig, prepped) in enumerate(nameprep_tests):
1600 if orig is None:
1601 # Skipped
1602 continue
1603 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001604 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001605 if prepped is None:
1606 # Input contains prohibited characters
1607 self.assertRaises(UnicodeError, nameprep, orig)
1608 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001609 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001610 try:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001611 self.assertEqual(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001612 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001613 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001614
Victor Stinnerf96418d2015-09-21 23:06:27 +02001615
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use.

    The same four names are used throughout: a plain ASCII name and an
    internationalised name, each with and without a trailing dot.
    """

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading again after the whole input was consumed yields "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().  The
        # third vector used to repeat the fourth, leaving the
        # no-trailing-dot internationalised name untested.
        vectors = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in vectors:
            with self.subTest(encoded=encoded):
                pieces = codecs.iterdecode(
                    (bytes([c]) for c in encoded), "idna")
                self.assertEqual("".join(pieces), expected)

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that terminates it is seen;
        # the last label is released by the final (flushing) call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot nothing is left over for the final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode; the third vector used to
        # repeat the fourth.
        vectors = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in vectors:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")), expected)

        encoder = codecs.getincrementalencoder("idna")()
        # The last label is held back until the final (flushing) call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1701
Victor Stinnerf96418d2015-09-21 23:06:27 +02001702
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001703class CodecsModuleTest(unittest.TestCase):
1704
1705 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001706 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1707 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001708 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001709 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001710 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001711
Victor Stinnera57dfd02014-05-14 17:13:14 +02001712 # test keywords
1713 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1714 '\xe4\xf6\xfc')
1715 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1716 '[]')
1717
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001718 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001719 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1720 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001721 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001722 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001723 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001724 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001725
Victor Stinnera57dfd02014-05-14 17:13:14 +02001726 # test keywords
1727 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1728 b'\xe4\xf6\xfc')
1729 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1730 b'[]')
1731
Walter Dörwald063e1e82004-10-28 13:04:26 +00001732 def test_register(self):
1733 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001734 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001735
1736 def test_lookup(self):
1737 self.assertRaises(TypeError, codecs.lookup)
1738 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001739 self.assertRaises(LookupError, codecs.lookup, " ")
1740
1741 def test_getencoder(self):
1742 self.assertRaises(TypeError, codecs.getencoder)
1743 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1744
1745 def test_getdecoder(self):
1746 self.assertRaises(TypeError, codecs.getdecoder)
1747 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1748
1749 def test_getreader(self):
1750 self.assertRaises(TypeError, codecs.getreader)
1751 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1752
1753 def test_getwriter(self):
1754 self.assertRaises(TypeError, codecs.getwriter)
1755 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001756
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001757 def test_lookup_issue1813(self):
1758 # Issue #1813: under Turkish locales, lookup of some codecs failed
1759 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001760 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001761 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1762 try:
1763 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1764 except locale.Error:
1765 # Unsupported locale on this system
1766 self.skipTest('test needs Turkish locale')
1767 c = codecs.lookup('ASCII')
1768 self.assertEqual(c.name, 'ascii')
1769
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001770 def test_all(self):
1771 api = (
1772 "encode", "decode",
1773 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1774 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1775 "getencoder", "getdecoder", "getincrementalencoder",
1776 "getincrementaldecoder", "getreader", "getwriter",
1777 "register_error", "lookup_error",
1778 "strict_errors", "replace_errors", "ignore_errors",
1779 "xmlcharrefreplace_errors", "backslashreplace_errors",
1780 "namereplace_errors",
1781 "open", "EncodedFile",
1782 "iterencode", "iterdecode",
1783 "BOM", "BOM_BE", "BOM_LE",
1784 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1785 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1786 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1787 "StreamReaderWriter", "StreamRecoder",
1788 )
1789 self.assertCountEqual(api, codecs.__all__)
1790 for api in codecs.__all__:
1791 getattr(codecs, api)
1792
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001793 def test_open(self):
1794 self.addCleanup(support.unlink, support.TESTFN)
1795 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1796 with self.subTest(mode), \
1797 codecs.open(support.TESTFN, mode, 'ascii') as file:
1798 self.assertIsInstance(file, codecs.StreamReaderWriter)
1799
1800 def test_undefined(self):
1801 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1802 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1803 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1804 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1805 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1806 self.assertRaises(UnicodeError,
1807 codecs.encode, 'abc', 'undefined', errors)
1808 self.assertRaises(UnicodeError,
1809 codecs.decode, b'abc', 'undefined', errors)
1810
Victor Stinnerf96418d2015-09-21 23:06:27 +02001811
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 ``StreamReader``."""

    def setUp(self):
        # Two Hangul syllables separated by a newline, encoded as UTF-8.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001821
Victor Stinnerf96418d2015-09-21 23:06:27 +02001822
class EncodedFileTest(unittest.TestCase):
    """Transcoding on read and on write through ``codecs.EncodedFile``."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed in are stored as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001834
# Codec names exercised by the generic tests below.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are added only when this build of codecs provides the
# corresponding encoder function.
all_unicode_encodings += [
    codec_name
    for encoder_attr, codec_name in (("mbcs_encode", "mbcs"),
                                     ("oem_encode", "oem"))
    if hasattr(codecs, encoder_attr)
]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001944
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001945# The following encoding is not tested, because it's not supposed
1946# to work:
1947# "undefined"
1948
1949# The following encodings don't work in stateful mode
# Codecs named here are skipped by the stream and incremental round-trip
# checks in BasicUnicodeTest.
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001954
Victor Stinnerf96418d2015-09-21 23:06:27 +02001955
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Smoke tests that push the same short ASCII text through every codec
    # named in all_unicode_encodings.

    def test_basics(self):
        # Round-trip the text through the stateless, stream and incremental
        # APIs of each codec.
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            label = "encoding=%r" % encoding

            # The name reported by the registry must match the list entry
            # once "_" and "-" are treated as equivalent.
            name = codecs.lookup(encoding).name
            if encoding == "latin_1":
                name = "latin_1"
            elif encoding.endswith("_codec"):
                name += "_codec"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                encoded, consumed = codecs.getencoder(encoding)(text)
                self.assertEqual(consumed, len(text), label)
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(decoded, text, label)

            # Everything below exercises stateful APIs.
            if encoding in broken_unicode_with_stateful:
                continue

            # check stream reader/writer, one character / one byte at a time
            sink = Queue(b"")
            writer = codecs.getwriter(encoding)(sink)
            pieces = []
            for char in text:
                writer.write(char)
                piece = sink.read()
                self.assertTrue(type(piece) is bytes, type(piece))
                pieces.append(piece)
            encodedresult = b"".join(pieces)

            source = Queue(b"")
            reader = codecs.getreader(encoding)(source)
            decodedresult = ""
            for byte in encodedresult:
                source.write(bytes([byte]))
                decodedresult += reader.read()
            self.assertEqual(decodedresult, text, label)

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = b"".join(
                    encoder.encode(char) for char in text)
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = "".join(
                    decoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, text, label)

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                result = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, text, label)

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode("", encoding)
                result = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, "")

            if encoding in ("idna", "mbcs"):
                continue

            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(
                    encoder.encode(char) for char in text)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decodedresult = "".join(
                    decoder.decode(bytes([byte])) for byte in encodedresult)
                self.assertEqual(decodedresult, text, label)

    @support.cpython_only
    def test_basics_capi(self):
        # Same incremental round trips as test_basics, but with the
        # encoder/decoder objects obtained through _testcapi.
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            label = "encoding=%r" % encoding

            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b"".join(
                    cencoder.encode(char) for char in text)
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decodedresult = "".join(
                    cdecoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, text, label)

            if encoding in ("idna", "mbcs"):
                continue

            # check incremental decoder/encoder with errors argument
            try:
                cencoder = codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(
                    cencoder.encode(char) for char in text)
                cdecoder = codec_incrementaldecoder(encoding, "ignore")
                decodedresult = "".join(
                    cdecoder.decode(bytes([byte])) for byte in encodedresult)
                self.assertEqual(decodedresult, text, label)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # idna is skipped because of SF bug #1163178 (FIXME)
            if encoding == "idna" or encoding in broken_unicode_with_stateful:
                continue
            raw = io.BytesIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(s, reader.read())

    def test_bad_decode_args(self):
        # Every decoder rejects a call without arguments; all but idna and
        # punycode are also checked against an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        # Every encoder rejects a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError):
                    encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
2117
Victor Stinnerf96418d2015-09-21 23:06:27 +02002118
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with the three kinds of mapping
    # used below (str, int->str dict, int->int dict) under each error
    # handler.

    def test_decode_with_string_map(self):
        data = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        # A table covering all three bytes decodes to itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Byte 2 is undecodable when the table is too short or holds U+FFFE.
        bad_tables = ("ab", "ab\ufffe")
        for table in bad_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)

        for errors, expected in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for table in bad_tables:
                self.assertEqual(decode(data, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        data = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        for mapping, expected in (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ):
            self.assertEqual(decode(data, "strict", mapping), (expected, 3))

        # Byte 2 is undecodable when it is missing from the map, mapped to
        # None, or mapped to U+FFFE (the last case is issue #14850).
        bad_maps = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in bad_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, expected in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for mapping in bad_maps:
                self.assertEqual(decode(data, errors, mapping), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        data = b"\x00\x01\x02"
        decode = codecs.charmap_decode
        a, b, c = ord('a'), ord('b'), ord('c')

        # The 0x10FFFF case is issue #15379.
        for mapping, expected in (
            ({0: a, 1: b, 2: c}, "abc"),
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        ):
            self.assertEqual(decode(data, "strict", mapping), (expected, 3))

        # A code point above sys.maxunicode is rejected with TypeError.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is undecodable when missing from the map or mapped to 0xFFFE.
        bad_maps = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in bad_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, expected in (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        ):
            for mapping in bad_maps:
                self.assertEqual(decode(data, errors, mapping), (expected, 3))
2353
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002354
class WithStmtTest(unittest.TestCase):
    # Checks that the codecs stream wrappers work as context managers:
    # data can be read inside the block and the wrapped stream is closed
    # once the block is left.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Mirror test_encodedfile: leaving the block must close the stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002368
Victor Stinnerf96418d2015-09-21 23:06:27 +02002369
class TypesTest(unittest.TestCase):
    # Checks which low-level decoder functions accept str input.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is rejected in strict mode; "replace" and
        # "backslashreplace" produce the substitutes asserted below.
        escaped_bytes = r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30"
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                             (escaped_bytes, 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002409
Serhiy Storchakad6793772013-01-29 10:20:44 +02002410
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash encode to themselves.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the code point with
        # the same value.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # The remaining characters below 32 and those in 127..255 are
        # written as \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by any other ASCII letter is kept verbatim
        # and must emit a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires);
        # every shorter digit count must be reported as an error.  \U takes
        # eight digits, so all of 0..7 are exercised for it.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF is also an error.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2497
2498
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() / raw_unicode_escape_decode().

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the byte with the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point with the same value.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but "u"/"U" is passed through.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash followed by anything but "u"/"U" is passed through.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires);
        # every shorter digit count must be reported as an error.  \U takes
        # eight digits, so all of 0..7 are exercised for it.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF is also an error.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2547
2548
class EscapeEncodeTest(unittest.TestCase):
    # Tests for codecs.escape_encode().

    def test_escape_encode(self):
        # Input bytes paired with the escaped form; the second item of the
        # returned tuple is the length of the input in every case.
        escaped_forms = (
            (b'', b''),
            (b'foobar', b'foobar'),
            (b'spam\0eggs', b'spam\\x00eggs'),
            (b'a\'b', b"a\\'b"),
            (b'b\\c', b'b\\\\c'),
            (b'c\nd', b'c\\nd'),
            (b'd\re', b'd\\re'),
            (b'f\x7fg', b'f\\x7fg'),
        )
        for data, escaped in escaped_forms:
            with self.subTest(data=data):
                self.assertEqual(codecs.escape_encode(data),
                                 (escaped, len(data)))

        # Only bytes objects are accepted.
        for rejected in ('spam', bytearray(b'spam')):
            with self.assertRaises(TypeError):
                codecs.escape_encode(rejected)
2567
2568
class SurrogateEscapeTest(unittest.TestCase):
    # Checks the "surrogateescape" error handler with several codecs.

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding raw must give text, and encoding text must give raw back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2601
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002602
class BomTest(unittest.TestCase):
    # Checks how many times the BOM ends up in a file written through
    # codecs.open() when writes are combined with seek().

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file for read/write with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in tests:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002658
Victor Stinner3fed0872010-05-22 02:16:27 +00002659
# Codecs that transform bytes into bytes.  The compression based ones are
# appended further down, but only when their backing module can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# Canonical transform codec name -> alternative names that must resolve to it.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; the name stays bound (to None) when it is missing so that
# later code can test it for truthiness.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

# bz2 is optional as well; nothing else refers to the module afterwards, so
# the name is simply left unbound when the import fails.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002689
Victor Stinnerf96418d2015-09-21 23:06:27 +02002690
class TransformCodecTest(unittest.TestCase):
    """Tests for the transform codecs listed in bytes_transform_encodings
    and transform_aliases (binary-to-binary codecs plus rot_13)."""

    def test_basics(self):
        # Push every byte value through the generic encoder/decoder pair
        # and make sure the round trip is lossless.
        every_byte = bytes(range(256))
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded, consumed = codecs.getencoder(codec_name)(every_byte)
                self.assertEqual(consumed, len(every_byte))
                decoded, consumed = codecs.getdecoder(codec_name)(encoded)
                self.assertEqual(consumed, len(encoded))
                self.assertEqual(decoded, every_byte)

    def test_read(self):
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(b"\x80", codec_name)
                make_reader = codecs.getreader(codec_name)
                reader = make_reader(io.BytesIO(encoded))
                self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(b"\x80", codec_name)
                make_reader = codecs.getreader(codec_name)
                reader = make_reader(io.BytesIO(encoded))
                self.assertEqual(reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # Every transform codec has to accept memoryview input for both
        # directions, and the result must match the bytes based round trip.
        plain = b"12345\x80"
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(plain, codec_name)
                encoded_from_view = codecs.encode(memoryview(plain),
                                                  codec_name)
                self.assertEqual(encoded_from_view, encoded)

                decoded = codecs.decode(encoded, codec_name)
                self.assertEqual(decoded, plain)
                decoded_from_view = codecs.decode(memoryview(encoded),
                                                  codec_name)
                self.assertEqual(decoded_from_view, decoded)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # str.encode() must reject binary -> binary codecs with a helpful
        # LookupError that is not chained to any other exception.
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        text = "bad input type"
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                pattern = template.format(codec_name)
                with self.assertRaisesRegex(LookupError, pattern) as caught:
                    text.encode(codec_name)
                self.assertIsNone(caught.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # str.encode() must reject str -> str codecs with a helpful message.
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        self.assertRaisesRegex(LookupError, pattern,
                               "just an example message".encode, "rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # bytes.decode() and bytearray.decode() must reject binary -> binary
        # codecs with a helpful message.  The input is encoded first so that
        # it satisfies whatever format the codec expects.
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        plain = b"encode first to ensure we meet any format restrictions"
        for codec_name in bytes_transform_encodings:
            with self.subTest(encoding=codec_name):
                encoded = codecs.encode(plain, codec_name)
                pattern = template.format(codec_name)
                for binary in (encoded, bytearray(encoded)):
                    with self.assertRaisesRegex(LookupError, pattern):
                        binary.decode(codec_name)

    def test_binary_to_text_blacklists_text_transforms(self):
        # A str -> str codec must give a helpful, unchained error when it is
        # used through the decode() method of a binary object.
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for binary in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=binary):
                with self.assertRaisesRegex(LookupError, pattern) as caught:
                    binary.decode("rot_13")
                self.assertIsNone(caught.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Malformed input: the zlib codec error must be reported with the
        # codec name and chained to a cause of the same exception type.
        pattern = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as caught:
            codecs.decode(b"hello", "zlib_codec")
        wrapper = caught.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    def test_custom_hex_error_is_wrapped(self):
        # Malformed input: the hex codec error must be reported with the
        # codec name and chained to a cause of the same exception type.
        pattern = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, pattern) as caught:
            codecs.decode(b"hello", "hex_codec")
        wrapper = caught.exception
        self.assertIsInstance(wrapper.__cause__, type(wrapper))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for canonical, aliases in transform_aliases.items():
            canonical_name = codecs.lookup(canonical).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    self.assertEqual(codecs.lookup(alias).name,
                                     canonical_name)

    def test_quopri_stateless(self):
        # The stateless encoder is expected to work with quotetabs=True ...
        self.assertEqual(
            codecs.encode(b"space tab\teol \n", "quopri-codec"),
            b"space=20tab=09eol=20\n")
        # ... while the decoder still has to accept unescaped tabs and
        # spaces.
        raw = b"space tab eol\n"
        self.assertEqual(codecs.decode(raw, "quopri-codec"), raw)

    def test_uu_invalid(self):
        # Input without a "begin" line is invalid.
        with self.assertRaises(ValueError):
            codecs.decode(b"", "uu-codec")
2822
Nick Coghlan8b097b42013-11-13 23:49:21 +10002823
2824# The codec system tries to wrap exceptions in order to ensure the error
2825# mentions the operation being performed and the codec involved. We
2826# currently *only* want this to happen for relatively stateless
2827# exceptions, where the only significant information they contain is their
2828# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002829
2830# Use a local codec registry to avoid appearing to leak objects when
Martin Panter119e5022016-04-16 09:28:57 +00002831# registering multiple search functions
# Codec name -> CodecInfo for the codecs that exist only for these tests.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Search function: return the test codec stored under *codec_name*,
    or None when no such test codec is currently registered."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it cannot be applied as a decorator.
codecs.register(_get_test_codec)
2837
def _forget_codec(codec_name):
    """No-op stand-in used when the interpreter offers no private helper
    for dropping a codec from its internal cache."""


try:
    # Issue #22166: CPython keeps an internal codec cache that has to be
    # cleared as well; prefer its private helper over the stand-in above.
    from _codecs import _forget_codec
except ImportError:
    # Helper not available: keep the no-op defined above.
    pass
2844
2845
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised by a codec are wrapped, and which are
    passed through untouched, by the encode/decode entry points."""

    def setUp(self):
        # A codec search function cannot be unregistered, so each test gets
        # its own codec name, derived from the test case repr, which keeps
        # the shared search function harmless once the test is over.  The
        # codecs module normalizes codec names (this does not appear to be
        # formally documented), hence the explicit normalization.  id(self)
        # is mixed in to make the name truly unique, which avoids trouble
        # with the codec cache when the tests are run several times (e.g.
        # when hunting for refleaks).
        raw_name = "{!r}{}".format(self, id(self))
        self.codec_name = encodings.normalize_encoding(raw_name).lower()

        # The object to raise lives on the instance: codec caching prevents
        # re-creating the codec entry, and regrtest refleak hunting runs the
        # same test instance repeatedly.  The codec therefore calls back
        # into the instance to learn what to raise, instead of closing over
        # an object that may be different on the next run.
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        name = self.codec_name
        # Issue #22166: besides the local registry, the lookup caches have
        # to be purged too, to avoid the appearance of reference leaks.
        for registry in (_TEST_CODECS, encodings._cache):
            registry.pop(name, None)
        with contextlib.suppress(KeyError):
            _forget_codec(name)

    def set_codec(self, encode, decode):
        """Install *encode* / *decode* as this test's private codec."""
        name = self.codec_name
        _TEST_CODECS[name] = codecs.CodecInfo(encode, decode, name=name)

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Context manager asserting that the body raises *exc_type* with
        the wrapped message format and a chained cause of the same type."""
        pattern = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, pattern) as caught:
            yield caught
        cause = caught.exception.__cause__
        self.assertIsInstance(cause, exc_type)
        self.assertIsNotNone(cause.__traceback__)

    def raise_obj(self, *args, **kwds):
        """Codec callback that raises whatever obj_to_raise currently is."""
        # Looked up at call time so a test can change it dynamically.
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Assert that *obj_to_raise* is wrapped by all four entry points."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        name = self.codec_name
        attempts = (
            ("encoding", lambda: "str_input".encode(name)),
            ("encoding", lambda: codecs.encode("str_input", name)),
            ("decoding", lambda: b"bytes input".decode(name)),
            ("decoding", lambda: codecs.decode(b"bytes input", name)),
        )
        for operation, attempt in attempts:
            with self.assertWrapped(operation, exc_type, msg):
                attempt()

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        text = "This should be wrapped"
        self.check_wrapped(RuntimeError(text), text)

    def test_raise_grandchild_subclass_exact_size(self):
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        class MyRuntimeError(RuntimeError):
            pass
        text = "This should be wrapped"
        self.check_wrapped(MyRuntimeError(text), text, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Assert that *obj_to_raise* reaches the caller unwrapped, with a
        message matching *msg*, through all four entry points."""
        def always_raise(*args, **kwds):
            raise obj_to_raise
        self.set_codec(always_raise, always_raise)
        name = self.codec_name
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(RuntimeError, msg):
                attempt()

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        text = "This should NOT be wrapped"
        error = RuntimeError(text)
        error.attr = 1
        self.check_not_wrapped(error, "^{}$".format(text))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        pattern = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), pattern)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is installed here, so the initial lookup fails; that
        # failure must surface as a plain LookupError.
        name = self.codec_name
        pattern = "^unknown encoding: {}$".format(name)
        attempts = (
            lambda: "str input".encode(name),
            lambda: codecs.encode("str input", name),
            lambda: b"bytes input".decode(name),
            lambda: codecs.decode(b"bytes input", name),
        )
        for attempt in attempts:
            with self.assertRaisesRegex(LookupError, pattern):
                attempt()

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are flagged, so the text model methods
        # skip them up front.  Third party codecs won't carry the flag, so
        # a codec producing an inappropriate output type still has to be
        # handled appropriately.
        name = self.codec_name
        self.set_codec(lambda *args, **kwds: ("not bytes!", 0),
                       lambda *args, **kwds: (b"not str!", 0))

        # The codecs module functions check neither input nor output types.
        self.assertEqual(codecs.encode(None, name), "not bytes!")
        self.assertEqual(codecs.decode(None, name), b"not str!")

        # The text model methods are expected to complain.
        template = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
                    r"use codecs.encode\(\) to encode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, template.format(name)):
            "str_input".encode(name)
        template = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
                    r"use codecs.decode\(\) to decode to arbitrary types$")
        with self.assertRaisesRegex(TypeError, template.format(name)):
            b"bytes input".decode(name)
3005
Nick Coghlanfdf239a2013-10-03 00:43:22 +10003006
Georg Brandl02524622010-12-02 18:06:51 +00003007
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        for expected_error, code_page in ((ValueError, -1), (OSError, 123)):
            with self.assertRaises(expected_error):
                codecs.code_page_encode(code_page, 'a')
            with self.assertRaises(expected_error):
                codecs.code_page_decode(code_page, b'a')

    def test_code_page_name(self):
        # The error message has to mention the code page by name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) entry of *tests* with code
        page *cp*; an expected value of None means UnicodeDecodeError."""
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors,
                                                         True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # The consumed length has to lie within 0..len(raw).
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) entry of *tests* with code
        page *cp*; an expected value of None means UnicodeEncodeError."""
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        utf7_code_page = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(utf7_code_page, encode_cases)
        self.check_decode(utf7_code_page, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # final=False: each case pairs the input with the expected
        # (decoded text, consumed byte count) result.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            result = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(result, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        import _bootlocale
        original = _bootlocale.getpreferredencoding
        _bootlocale.getpreferredencoding = lambda *a: 'cp123'
        try:
            self.assertEqual(codecs.lookup('cp123').name, 'mbcs')
        finally:
            # Always undo the monkeypatch, even when the assertion fails.
            _bootlocale.getpreferredencoding = original
3179
Victor Stinner3a50e702011-10-18 21:21:00 +02003180
class ASCIITest(unittest.TestCase):
    """Tests for the 'ascii' codec and its error handlers."""

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # The first character can be encoded by the handler, the second
        # one cannot.
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3218
3219
class Latin1Test(unittest.TestCase):
    """Tests for the 'latin1' codec and its error handlers."""

    def test_encode(self):
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                encoded = text.encode('latin1')
                self.assertEqual(encoded, wanted)

    def test_encode_errors(self):
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('latin1', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # The first character can be encoded by the handler, the second
        # one cannot.
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                decoded = raw.decode('latin1')
                self.assertEqual(decoded, wanted)
3255
3256
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()