blob: 825a7dd95f7002d3905a7b0388759cbe7c2e0ab6 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
Nick Coghlanc72e4e62013-11-22 22:39:36 +10007import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01008
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# ctypes is optional: when it cannot be imported the module is replaced by
# None and SIZEOF_WCHAR_T falls back to -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000018
def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and the codec *coder*.

    The returned ``check(input, expect)`` calls ``coder(input)`` and asserts
    that the result equals the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))

    return check
23
Victor Stinnerf96418d2015-09-21 23:06:27 +020024
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the type of everything read back,
        # because all later operations are slices/concatenations of it.
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; slicing with [:0] keeps
        # the buffer's type while emptying it.
        if size < 0:
            data, self._buffer = self._buffer, self._buffer[:0]
            return data
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
44
Victor Stinnerf96418d2015-09-21 23:06:27 +020045
class MixInCheckStateHandling:
    """Mix-in with getstate()/setstate() round-trip checks.

    The host class must provide the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split s at every position: decode the head with one decoder, move
        # its state into a fresh decoder, decode the tail there, and require
        # that both parts together give u.
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            flags = state[1]
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                pending = state[0]
                # reset decoder to the default state without anything buffered
                decoder.setstate((pending[:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(pending))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: encode the head, transplant the state into
        # a fresh encoder, encode the tail, and compare with s.
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
78
Victor Stinnerf96418d2015-09-21 23:06:27 +020079
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder checks shared by the per-encoding test cases.

    The methods read ``self.encoding``; test_lone_surrogates() additionally
    reads ``self.ill_formed_sequence``. Both are supplied by subclasses.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for byte, expected in zip(input.encode(self.encoding), partialresults):
            queue.write(bytes([byte]))
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        # do the check again, this time using an incremental decoder: once
        # with the fresh decoder, then once more after reset() to check
        # whether the reset method works properly
        decoder = codecs.getincrementaldecoder(self.encoding)()
        for reset_first in (False, True):
            if reset_first:
                decoder.reset()
            decoded = ""
            for byte, expected in zip(input.encode(self.encoding),
                                      partialresults):
                decoded += decoder.decode(bytes([byte]))
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # check iterdecode()
        single_bytes = [bytes([byte]) for byte in input.encode(self.encoding)]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Collect lines until readline() returns an empty string.
            reader = getreader(input)
            next_line = lambda: reader.readline(size=size, keepends=keepends)
            return "|".join(iter(next_line, ""))

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        bodies = [(i*200+200)*"\u3042" for i in range(len(lineends))]
        with_ends = [body + lineend
                     for body, lineend in zip(bodies, lineends)]
        self.assertEqual(readalllines("".join(with_ends), True),
                         "|".join(with_ends))
        self.assertEqual(readalllines("".join(with_ends), False),
                         "|".join(bodies))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                expectations = (
                    (True, size*"a" + lineend, "xxx\n"),
                    (False, size*"a", "xxx"),
                )
                for keepends, first, second in expectations:
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.read(), ''.join(lines[1:]))
        self.assertEqual(reader.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.readlines(), lines[1:])
        self.assertEqual(reader.read(), '')

        # Test read() followed by read()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #12446: Test read() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        # Iterating the reader must give back each "\r\n"-terminated line
        # exactly as it was written.
        lines = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        payload = "".join(lines).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        for index, line in enumerate(reader):
            self.assertEqual(line, lines[index])

    def test_readlinequeue(self):
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        def expect(keepends, *expected_lines):
            # Each expected entry is the result of one readline() call.
            for expected in expected_lines:
                self.assertEqual(reader.readline(keepends=keepends), expected)

        # No lineends
        writer.write("foo\r")
        expect(False, "foo")
        writer.write("\nbar\r")
        expect(False, "", "bar")
        writer.write("baz")
        expect(False, "baz", "")

        # Lineends
        writer.write("foo\r")
        expect(True, "foo\r")
        writer.write("\nbar\r")
        expect(True, "\n", "bar\r")
        writer.write("baz")
        expect(True, "baz", "")
        writer.write("foo\r\n")
        expect(True, "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        encoded = (s1+s2+s3).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # The trailing "" is what readline() returns once input is exhausted.
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        encoded = (s1+s2+s3+s4+s5).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(encoded))
        # The trailing "" is what readline() returns once input is exhausted.
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)

    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        # a single lone surrogate under each lenient encode error handler
        single_cases = (
            ("backslashreplace", "[\\udc80]"),
            ("namereplace", "[\\udc80]"),
            ("xmlcharrefreplace", "[&#56448;]"),
            ("ignore", "[]"),
            ("replace", "[?]"),
        )
        for handler, rendered in single_cases:
            self.assertEqual("[\uDC80]".encode(self.encoding, handler),
                             rendered.encode(self.encoding))

        # sequential surrogate characters
        for handler, rendered in (("ignore", "[]"), ("replace", "[??]")):
            self.assertEqual("[\uD800\uDC80]".encode(self.encoding, handler),
                             rendered.encode(self.encoding))

        # Whatever encoding "" produces is stripped from the individually
        # encoded parts and put back once at the front of the test sequence.
        bom = "".encode(self.encoding)
        skip = len(bom)
        surroundings = [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]
        for before, after in surroundings:
            prefix = before.encode(self.encoding)[skip:]
            suffix = after.encode(self.encoding)[skip:]
            test_string = before + "\uDC80" + after
            test_sequence = bom + prefix + self.ill_formed_sequence + suffix
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(
                test_string.encode(self.encoding, "surrogatepass"),
                test_sequence)
            self.assertEqual(
                test_sequence.decode(self.encoding, "surrogatepass"),
                test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            escaped = ''.join('\\x%02x' % b
                              for b in self.ill_formed_sequence)
            self.assertEqual(
                test_sequence.decode(self.encoding, "backslashreplace"),
                before + escaped + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200388
Victor Stinnerf96418d2015-09-21 23:06:27 +0200389
class UTF32Test(ReadTest, unittest.TestCase):
    """Checks for the "utf-32" codec."""

    encoding = "utf-32"
    # The ill-formed sequence is picked according to sys.byteorder.
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" spelled out as BOM + four bytes per character, in both
    # byte orders.
    spamle = (b'\xff\xfe\x00\x00' +
              b''.join(bytes([c, 0, 0, 0]) for c in b'spamspam'))
    spambe = (b'\x00\x00\xfe\xff' +
              b''.join(bytes([0, 0, 0, c]) for c in b'spamspam'))

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        written = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(written, (self.spamle, self.spambe))
        # try to read it back
        back = info.streamreader(io.BytesIO(written))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        for data in (4*b"\xff", 8*b"\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(data))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected prefix per encoded byte.
        expected = (
            [""] * 4      # BOM bytes; byteorder known after the fourth
            + [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
484
Victor Stinnerf96418d2015-09-21 23:06:27 +0200485
class UTF32LETest(ReadTest, unittest.TestCase):
    """Checks for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected prefix per encoded byte.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
530
Victor Stinnerf96418d2015-09-21 23:06:27 +0200531
class UTF32BETest(ReadTest, unittest.TestCase):
    """Checks for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected prefix per encoded byte.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
576
577
class UTF16Test(ReadTest, unittest.TestCase):
    """Checks for the "utf-16" codec."""

    encoding = "utf-16"
    # The ill-formed sequence is picked according to sys.byteorder.
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" spelled out as BOM + two bytes per character, in both
    # byte orders.
    spamle = b'\xff\xfe' + b''.join(bytes([c, 0]) for c in b'spamspam')
    spambe = b'\xfe\xff' + b''.join(bytes([0, c]) for c in b'spamspam')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        written = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(written, (self.spamle, self.spambe))
        # try to read it back
        back = info.streamreader(io.BytesIO(written))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(data))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected prefix per encoded byte.
        expected = (
            [""] * 2      # BOM bytes; byteorder known after the second
            + [""] * 1
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        encoded = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(encoded)
        # Opening with mode 'U' happens under check_warnings, which is told
        # to expect a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000663
class UTF16LETest(ReadTest, unittest.TestCase):
    """Checks for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected prefix per encoded byte.
        expected = (
            [""] * 1
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (undecodable input, result expected with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, replaced in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
707
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec ("utf-16-be")."""

    encoding = "utf-16-be"
    # The lone low surrogate U+DC80 stored in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected string per encoded byte (12 bytes in total).
        # NOTE(review): check_partial is inherited and not visible here;
        # presumably it feeds the encoded input byte by byte and compares
        # the accumulated decoder output with each entry -- confirm.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, text produced by decoding that
        # input with errors='replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # One subTest per input: a failure names the offending bytes and
            # the remaining inputs are still exercised.
            with self.subTest(raw=raw, expected=expected):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the surrogate pair D800 DE03,
        # each code unit stored high byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
751
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec.

    Every check goes through ``self.encoding`` and ``self.BOM`` so that a
    subclass can rerun them for a codec that prepends a signature.
    """

    encoding = "utf-8"
    # The three bytes that would encode the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3
    # Plain UTF-8 output carries no signature.
    BOM = b''

    def test_partial(self):
        # One expected string per encoded byte (15 bytes in total).
        # NOTE(review): check_partial is inherited and not visible here;
        # presumably it compares the accumulated incremental decoder output
        # after each byte with these entries -- confirm.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_after_each_byte = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_decoder_state(self):
        # Sample text with code points at the edges of each UTF-8 length.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_decode_error(self):
        # The same two invalid bytes, decoded under each error handler.
        malformed = b'[\x80\xff]'
        outcomes = [
            ('ignore', '[]'),
            ('replace', '[\ufffd\ufffd]'),
            ('surrogateescape', '[\udc80\udcff]'),
            ('backslashreplace', '[\\x80\\xff]'),
        ]
        for error_handler, expected in outcomes:
            with self.subTest(data=malformed, error_handler=error_handler,
                              expected=expected):
                decoded = malformed.decode(self.encoding, error_handler)
                self.assertEqual(decoded, expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode(self.encoding, "surrogateescape")
        self.assertEqual(escaped, self.BOM + b'[\x80]')

        # U+DC80 can be escaped back to a byte, but the two surrogates that
        # follow cannot; the error must cover exactly those two.
        with self.assertRaises(UnicodeEncodeError) as caught:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        error = caught.exception
        self.assertEqual(error.object[error.start:error.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        handler = "surrogatepass"

        # Encoding: surrogates are written out as three-byte sequences.
        encode_cases = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
            ("[\uD800\uDC80]", b'[\xed\xa0\x80\xed\xb2\x80]'),
        )
        for text, payload in encode_cases:
            self.assertEqual(text.encode(self.encoding, handler),
                             self.BOM + payload)

        # Decoding: the same byte sequences come back as surrogates.
        decode_cases = (
            (b"abc\xed\xa0\x80def", "abc\ud800def"),
            (b"\xf0\x90\xbf\xbf\xed\xa0\x80", "\U00010fff\uD800"),
        )
        for payload, text in decode_cases:
            self.assertEqual(payload.decode(self.encoding, handler), text)

        self.assertTrue(codecs.lookup_error(handler))
        # A truncated or interrupted surrogate sequence is still an error.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode(self.encoding, handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000827
Victor Stinnerf96418d2015-09-21 23:06:27 +0200828
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec."""

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means that encoding
        # must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            # One subTest per case: a failure identifies its input and the
            # remaining cases are still run.
            with self.subTest(text=text, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)
                    continue
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                                 '%a.encode("cp65001", %r)=%a != %a'
                                 % (text, errors, encoded, expected))

    def test_decode(self):
        # (bytes, error handler, expected text); None means that decoding
        # must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            # One subTest per case: a failure identifies its input and the
            # remaining cases are still run.
            with self.subTest(raw=raw, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)
                    continue
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                                 '%a.decode("cp65001", %r)=%a != %a'
                                 % (raw, errors, decoded, expected))

    def test_lone_surrogates(self):
        # With the default (strict) handler a lone surrogate is rejected in
        # both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Every other handler substitutes its own replacement.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" must carry surrogates through both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
916
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200917
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_ascii(self):
        def assert_direct(chars):
            # These characters must be encoded as themselves and decode
            # back unchanged.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(self.encoding), as_ascii)
            self.assertEqual(as_ascii.decode(self.encoding), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        assert_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        assert_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        assert_direct(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_covered = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_covered))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected string per encoded byte (32 bytes in total).
        # NOTE(review): check_partial is inherited and not visible here;
        # presumably it compares the accumulated incremental decoder output
        # after each byte with these entries -- confirm.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_after_each_byte = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_errors(self):
        # (malformed input, text produced by decoding with errors='replace')
        malformed_inputs = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in malformed_inputs:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        codec = self.encoding
        # (texts that must encode to the same bytes,
        #  encoded form with the closing '-',
        #  the same encoded form without it).
        # The first text of each group is also the expected decoding.
        groups = [
            (('\U000104A0', '\ud801\udca0'), b'+2AHcoA-', b'+2AHcoA'),
            (('\u20ac\U000104A0',), b'+IKzYAdyg-', b'+IKzYAdyg'),
            (('\u20ac\u20ac\U000104A0',), b'+IKwgrNgB3KA-', b'+IKwgrNgB3KA'),
        ]
        for sources, terminated, unterminated in groups:
            for source in sources:
                self.assertEqual(source.encode(codec), terminated)
            decoded = sources[0]
            self.assertEqual(terminated.decode(codec), decoded)
            self.assertEqual(unterminated.decode(codec), decoded)

    def test_lone_surrogates(self):
        # (input, text produced by decoding with errors='replace')
        surrogate_inputs = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in surrogate_inputs:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001049
1050
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level ``codecs.utf_16_ex_decode`` function."""

    def test_errors(self):
        # A single byte cannot be decoded as a final UTF-16 chunk.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1058
class ReadBufferTest(unittest.TestCase):
    """Tests for ``codecs.readbuffer_encode``."""

    def test_array(self):
        import array
        # An array of signed bytes must come back as the same raw bytes,
        # together with the number of bytes consumed.
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are rejected.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1074
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Reruns the UTF-8 tests for "utf-8-sig" and adds BOM-specific ones."""

    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # One expected string per encoded byte (21 bytes in total).
        # NOTE(review): check_partial is inherited and not visible here;
        # presumably it compares the accumulated incremental decoder output
        # after each byte with these entries -- confirm.
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_after_each_byte = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # The signature written by the encoder must be dropped again by the
        # incremental decoder.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _read_stream(self, make_reader, bytestring, sizehint):
        # Decode *bytestring* through a stream reader built by *make_reader*,
        # reading *sizehint* units at a time (everything at once when it is
        # None) until the reader returns nothing, and return the joined text.
        istream = make_reader(io.BytesIO(bytestring))
        ostream = io.StringIO()
        while True:
            if sizehint is None:
                data = istream.read()
            else:
                data = istream.read(sizehint)
            if not data:
                break
            ostream.write(data)
        return ostream.getvalue()

    def test_stream_bom(self):
        # Input that starts with the signature: it must not show up in the
        # decoded text, whatever the read size.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            got = self._read_stream(reader, bytestring, sizehint)
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            got = self._read_stream(reader, bytestring, sizehint)
            self.assertEqual(got, unistring)
1159
1160class EscapeDecodeTest(unittest.TestCase):
1161 def test_empty(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001162 self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Serhiy Storchaka8490f5a2015-03-20 09:00:36 +02001163 self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +00001164
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001165 def test_raw(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001166 decode = codecs.escape_decode
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001167 for b in range(256):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001168 b = bytes([b])
1169 if b != b'\\':
1170 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001171
1172 def test_escape(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001173 decode = codecs.escape_decode
1174 check = coding_checker(self, decode)
1175 check(b"[\\\n]", b"[]")
1176 check(br'[\"]', b'["]')
1177 check(br"[\']", b"[']")
1178 check(br"[\\]", br"[\]")
1179 check(br"[\a]", b"[\x07]")
1180 check(br"[\b]", b"[\x08]")
1181 check(br"[\t]", b"[\x09]")
1182 check(br"[\n]", b"[\x0a]")
1183 check(br"[\v]", b"[\x0b]")
1184 check(br"[\f]", b"[\x0c]")
1185 check(br"[\r]", b"[\x0d]")
1186 check(br"[\7]", b"[\x07]")
1187 check(br"[\8]", br"[\8]")
1188 check(br"[\78]", b"[\x078]")
1189 check(br"[\41]", b"[!]")
1190 check(br"[\418]", b"[!8]")
1191 check(br"[\101]", b"[A]")
1192 check(br"[\1010]", b"[A0]")
1193 check(br"[\501]", b"[A]")
1194 check(br"[\x41]", b"[A]")
1195 check(br"[\X41]", br"[\X41]")
1196 check(br"[\x410]", b"[A0]")
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001197 for b in range(256):
1198 if b not in b'\n"\'\\abtnvfr01234567x':
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001199 b = bytes([b])
1200 check(b'\\' + b, b'\\' + b)
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001201
1202 def test_errors(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001203 decode = codecs.escape_decode
1204 self.assertRaises(ValueError, decode, br"\x")
1205 self.assertRaises(ValueError, decode, br"[\x]")
1206 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1207 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1208 self.assertRaises(ValueError, decode, br"\x0")
1209 self.assertRaises(ValueError, decode, br"[\x0]")
1210 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
1211 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001212
Victor Stinnerf96418d2015-09-21 23:06:27 +02001213
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001214class RecodingTest(unittest.TestCase):
1215 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001216 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +02001217 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001218 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001219 f2.close()
1220 # Python used to crash on this at exit because of a refcount
1221 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +00001222
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001223 self.assertTrue(f.closed)
1224
# From RFC 3492
# Sample strings for the Punycode codec: each entry is a pair
# (decoded text, Punycode-encoded bytes).
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print any entry that is not a
# two-element (text, bytes) pair, e.g. after a missing comma merged or split
# an entry.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332
Victor Stinnerf96418d2015-09-21 23:06:27 +02001333
class PunycodeTest(unittest.TestCase):
    # Runs the punycode codec over the module-level punycode_testcases
    # table of (unicode text, punycode bytes) pairs.

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our code produces
            # only lower case, and lower-casing just the expected value
            # is not enough because some input characters are upper case.
            produced = str(text.encode("punycode"), "ascii")
            wanted = str(expected, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from a copy rebuilt through an ASCII
            # decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352
Victor Stinnerf96418d2015-09-21 23:06:27 +02001353
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the deprecated "unicode_internal" codec.

    # Filter passed to support.check_warnings() wherever the deprecation
    # warning is required to be emitted.
    _deprecation = ('unicode_internal codec has been deprecated',
                    DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"

        def native(raw):
            # The literals below are spelled big-endian; flip them on
            # little-endian machines.
            return raw[::-1] if little_endian else raw

        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in valid:
            internal = native(raw)
            with support.check_warnings():
                self.assertEqual(expected, internal.decode("unicode_internal"))
        for raw in rejected:
            internal = native(raw)
            with support.check_warnings(self._deprecation):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

        # One out-of-range code unit, spelled in native byte order, together
        # with the text the "backslashreplace" handler must produce for it.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        invalid_backslashreplace = (r"\x00\x00\x11\x00" if little_endian
                                    else r"\x00\x11\x00\x00")
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')
        with support.check_warnings():
            escaped = invalid.decode("unicode_internal", "backslashreplace")
            self.assertEqual(escaped, invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must describe the offending input.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(self._deprecation):
                data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A registered error callback must be used by the decoder.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._deprecation):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four bogus bytes between the two encoded characters.
            corrupted = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decode(bytes(corrupted, "ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._deprecation):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            for text in ("a", "\xe9\u0142"):
                # The reported length counts characters consumed.
                self.assertEqual(encode(text)[1], len(text))

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001434
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings,
# consumed by NameprepTest.test_nameprep:
#   * expected is None  -> nameprep() must raise UnicodeError for the input;
#   * (None, None)      -> the vector is skipped; it is kept as a placeholder
#                          so that list position N still reports as
#                          "Test 3.<N+1>" in failure messages.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1587
1588
class NameprepTest(unittest.TestCase):
    # Runs encodings.idna.nameprep over the nameprep_tests vectors.

    def test_nameprep(self):
        from encodings.idna import nameprep

        def as_text(raw):
            # The Unicode strings are given in UTF-8
            return str(raw, "utf-8", "surrogatepass")

        # Vectors are numbered from 1 to match the "3.N" labels of the
        # source document.
        for number, (raw_input, raw_expected) in enumerate(nameprep_tests, 1):
            if raw_input is None:
                # Skipped
                continue
            source = as_text(raw_input)
            if raw_expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, source)
                continue
            expected = as_text(raw_expected)
            try:
                self.assertEqual(nameprep(source), expected)
            except Exception as exc:
                # Re-raise with the vector number so the failure can be
                # located in the table.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001607
Victor Stinnerf96418d2015-09-21 23:06:27 +02001608
class IDNACodecTest(unittest.TestCase):
    # Tests for the "idna" codec through the one-shot, stream and
    # incremental interfaces.  The same four names are used throughout:
    # ASCII and non-ASCII, each with and without a trailing dot.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        def decode_bytewise(data):
            # Feed the input one byte at a time so the decoder has to
            # buffer partial labels.
            return "".join(
                codecs.iterdecode((bytes([c]) for c in data), "idna"))

        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        # This case used to be a verbatim copy of the trailing-dot one
        # below, leaving the name without a trailing dot untested.
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org"),
                         "pyth\xf6n.org")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")

        # Without a trailing dot the last label is only emitted on the
        # final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot the last label is emitted as soon as the dot
        # arrives, leaving nothing for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        def encode_charwise(text):
            # Iterating a str feeds the encoder one character at a time.
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(encode_charwise("python.org"), b"python.org")
        self.assertEqual(encode_charwise("python.org."), b"python.org.")
        # This case used to be a verbatim copy of the trailing-dot one
        # below, leaving the name without a trailing dot untested.
        self.assertEqual(encode_charwise("pyth\xf6n.org"),
                         b"xn--pythn-mua.org")
        self.assertEqual(encode_charwise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")

        # Without a trailing dot the last label is only emitted on the
        # final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot nothing is left for the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1694
Victor Stinnerf96418d2015-09-21 23:06:27 +02001695
class CodecsModuleTest(unittest.TestCase):
    # Exercises the module-level functions of codecs: argument checking,
    # lookup failures and the advertised public API.

    def _check_getter(self, getter):
        # Shared by the get*() tests: calling without arguments is a
        # TypeError and an unknown codec name is a LookupError.
        self.assertRaises(TypeError, getter)
        self.assertRaises(LookupError, getter, "__spam__")

    def test_decode(self):
        umlauts = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'), umlauts)
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, umlauts)
        lenient = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(lenient, '[]')

    def test_encode(self):
        umlauts = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), umlauts)
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, umlauts)
        lenient = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(lenient, b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self._check_getter(codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self._check_getter(codecs.getencoder)

    def test_getdecoder(self):
        self._check_getter(codecs.getdecoder)

    def test_getreader(self):
        self._check_getter(codecs.getreader)

    def test_getwriter(self):
        self._check_getter(codecs.getwriter)

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must list exactly these names, and each of them
        # must really exist in the module.
        expected = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected, codecs.__all__)
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        # Whatever the mode, codecs.open() hands back a StreamReaderWriter.
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec rejects everything, including empty input,
        # whichever error handler is requested.
        for text, data in (('abc', b'abc'), ('', b'')):
            self.assertRaises(UnicodeError, codecs.encode, text, 'undefined')
            self.assertRaises(UnicodeError, codecs.decode, data, 'undefined')
        for handler in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', handler)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', handler)
1803
Victor Stinnerf96418d2015-09-21 23:06:27 +02001804
class StreamReaderTest(unittest.TestCase):
    # Checks line splitting of a UTF-8 StreamReader.

    def setUp(self):
        # Two lines of UTF-8 text; only the first ends with a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001814
Victor Stinnerf96418d2015-09-21 23:06:27 +02001815
class EncodedFileTest(unittest.TestCase):
    # Checks the transcoding wrapper returned by codecs.EncodedFile().

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller sees
        # UTF-16-LE.
        backing = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(backing, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file receives
        # Latin-1.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(backing.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001827
# Text encodings exercised by the generic tests, in alphabetical order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" and "oem" are added only when this build of codecs provides the
# matching *_encode function.
all_unicode_encodings.extend(
    name for name in ("mbcs", "oem") if hasattr(codecs, name + "_encode")
)

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001947
Victor Stinnerf96418d2015-09-21 23:06:27 +02001948
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Sanity checks applied to every codec in all_unicode_encodings.

    Each per-codec check runs inside ``self.subTest(encoding=...)`` so that a
    failure is reported together with the codec that caused it and the
    remaining codecs are still checked.
    """

    def _check_stream_roundtrip(self, encoding, s):
        """Round-trip *s* through the codec's stream writer and stream reader."""
        # Write one character at a time and drain the queue after each write.
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            self.assertIs(type(chunk), bytes)
            encodedresult += chunk
        # Feed the encoded data back to the reader one byte at a time.
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s)

    def _check_incremental_roundtrip(self, encoding, s):
        """Round-trip *s* through the incremental codec and iterencode()/iterdecode()."""
        try:
            encoder = codecs.getincrementalencoder(encoding)()
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check incremental decoder/encoder
            encodedresult = b""
            for c in s:
                encodedresult += encoder.encode(c)
            encodedresult += encoder.encode("", True)
            decoder = codecs.getincrementaldecoder(encoding)()
            decodedresult = ""
            for c in encodedresult:
                decodedresult += decoder.decode(bytes([c]))
            decodedresult += decoder.decode(b"", True)
            self.assertEqual(decodedresult, s)

            # check iterencode()/iterdecode()
            result = "".join(codecs.iterdecode(
                    codecs.iterencode(s, encoding), encoding))
            self.assertEqual(result, s)

            # check iterencode()/iterdecode() with empty string
            result = "".join(codecs.iterdecode(
                    codecs.iterencode("", encoding), encoding))
            self.assertEqual(result, "")

        if encoding not in ("idna", "mbcs"):
            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                encodedresult = b"".join(encoder.encode(c) for c in s)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decodedresult = "".join(decoder.decode(bytes([c]))
                                        for c in encodedresult)
                self.assertEqual(decodedresult, s)

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                # The registered codec name must match the list entry, modulo
                # "_" vs "-", the "_codec" suffix and the latin_1 spelling.
                name = codecs.lookup(encoding).name
                if encoding.endswith("_codec"):
                    name += "_codec"
                elif encoding == "latin_1":
                    name = "latin_1"
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

                with support.check_warnings():
                    # unicode-internal has been deprecated
                    (b, size) = codecs.getencoder(encoding)(s)
                    self.assertEqual(size, len(s))
                    (chars, size) = codecs.getdecoder(encoding)(b)
                    self.assertEqual(chars, s)

                if encoding not in broken_unicode_with_stateful:
                    self._check_stream_roundtrip(encoding, s)
                    self._check_incremental_roundtrip(encoding, s)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                reader = codecs.getreader(encoding)(
                    io.BytesIO(s.encode(encoding)))
                for t in range(5):
                    # Test that calling seek resets the internal codec state
                    # and buffers
                    reader.seek(0, 0)
                    data = reader.read()
                    self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                decoder = codecs.getdecoder(encoding)
                self.assertRaises(TypeError, decoder)
                if encoding not in ("idna", "punycode"):
                    self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            with self.subTest(encoding=encoding):
                encoder = codecs.getencoder(encoding)
                with support.check_warnings():
                    # unicode-internal has been deprecated
                    self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison itself is deliberately trivial.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_unicode_with_stateful:
                continue
            with self.subTest(encoding=encoding):
                encoded = u.encode(encoding)
                self.check_state_handling_decode(encoding, u, encoded)
                self.check_state_handling_encode(encoding, u, encoded)
2110
Victor Stinnerf96418d2015-09-21 23:06:27 +02002111
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with str, int->str and int->int maps."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        # Tables covering all three bytes decode to the table itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(raw, "strict", table), (table, 3))

        # Byte 2 is undefined: table too short, or entry is U+FFFE.
        undefined = ("ab", "ab\ufffe")
        for table in undefined:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in handlers:
            for table in undefined:
                self.assertEqual(decode(raw, errors, table), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, text in complete:
            self.assertEqual(decode(raw, "strict", mapping), (text, 3))

        # Byte 2 is undefined: key missing, mapped to None, or mapped to
        # U+FFFE (Issue #14850).
        undefined = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in undefined:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", mapping)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in handlers:
            for mapping in undefined:
                self.assertEqual(decode(raw, errors, mapping), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        complete = (
            (a, "abc"),
            # Issue #15379
            (0x10FFFF, "\U0010FFFFbc"),
            (sys.maxunicode, chr(sys.maxunicode) + "bc"),
        )
        for first, text in complete:
            self.assertEqual(
                decode(raw, "strict", {0: first, 1: b, 2: c}),
                (text, 3)
            )

        # A code point above sys.maxunicode is rejected as a TypeError.
        self.assertRaises(TypeError,
                          decode, raw, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is undefined: key missing, or mapped to 0xFFFE.
        undefined = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in undefined:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", mapping)

        handlers = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in handlers:
            for mapping in undefined:
                self.assertEqual(decode(raw, errors, mapping), (text, 3))
2346
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002347
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The file holds UTF-8 data; EncodedFile hands it back as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the with block must close the wrapped stream.
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Leaving the with block must close the wrapped stream, exactly as
        # checked for EncodedFile above.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002361
Victor Stinnerf96418d2015-09-21 23:06:27 +02002362
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )

        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point.
        literal = r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30"
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                             (literal, 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002402
Serhiy Storchakad6793772013-01-29 10:20:44 +02002403
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded as itself.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named:
            check(char, escaped)
        # Remaining control characters use the \xNN form ...
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        # ... as does everything from DEL up to U+00FF.
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for encoded, text in cases:
            check(encoded, text)
        # A backslash before any other byte is kept literally.
        recognized = b'\n"\'\\abtnvfr01234567xuUN'
        for code in range(256):
            if code in recognized:
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for letter, digits in ((b'x', 2), (b'u', 4), (b'U', 4)):
            for count in range(digits):
                # An escape followed by too few hex digits.
                truncated = b"\\" + letter + b"0" * count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2480
2481
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and raw_unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything except u/U passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything except u/U passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for letter, digits in ((b'u', 4), (b'U', 4)):
            for count in range(digits):
                # An escape followed by too few hex digits.
                truncated = b"\\" + letter + b"0" * count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2530
2531
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def _check_roundtrip(self, encoding, raw, text):
        """Assert *raw* decodes to *text* and *text* encodes back to *raw*."""
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2564
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002565
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files across seeks."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )

        def reopen(encoding):
            # Every scenario starts from a freshly truncated test file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002621
Victor Stinner3fed0872010-05-22 02:16:27 +00002622
# Bytes-to-bytes transform codecs listed unconditionally; the zlib and bz2
# codecs are added below only when their modules can be imported.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alias names for each transform codec, keyed by codec name.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Leave the name bound so it can be tested for availability.
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002652
Victor Stinnerf96418d2015-09-21 23:06:27 +02002653
class TransformCodecTest(unittest.TestCase):
    # Tests for the bytes -> bytes (and str -> str) transform codecs listed
    # in bytes_transform_encodings / transform_aliases.
    #
    # All regular expression patterns in this class are written as raw
    # string literals: "\(" in a normal literal is an invalid escape
    # sequence and triggers a warning at compile time.

    def test_basics(self):
        # Round-trip every byte value through the stateless encoder/decoder
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # StreamReader.read() must decode the encoded payload back
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # StreamReader.readline() must decode the encoded payload back
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        # Both halves must be raw so that \( and \) reach the regex engine
        fmt = (r"{!r} is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        fmt = (r"{!r} is not a text encoding; "
               r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper must chain to an original error of the same type
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The wrapper must chain to an original error of the same type
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2785
Nick Coghlan8b097b42013-11-13 23:49:21 +10002786
2787# The codec system tries to wrap exceptions in order to ensure the error
2788# mentions the operation being performed and the codec involved. We
2789# currently *only* want this to happen for relatively stateless
2790# exceptions, where the only significant information they contain is their
2791# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002792
# A single search function backed by this module-level table is registered
# once, instead of registering a new search function per test, so that the
# tests do not appear to leak objects.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Codec search function: hand back the registered test codec, or None
    # so that the lookup falls through for every other name.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it cannot be applied as a decorator.
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # No such helper could be imported; nothing to forget.
        return None
2807
2808
class ExceptionChainingTest(unittest.TestCase):
    # Checks which exceptions raised by a codec get wrapped (chained) by
    # the codec machinery and which ones are passed through untouched.

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # The codec was never looked up, so there is nothing cached
            pass

    def set_codec(self, encode, decode):
        # Publish a codec under this test's unique name via _TEST_CODECS
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager asserting that the body raises exc_type with the
        # "<operation> with <codec> codec failed (...)" wrapper message and
        # that the original exception is chained as __cause__.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Raise obj_to_raise from both codec directions and check that all
        # four entry points report it wrapped.
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Raise obj_to_raise from both codec directions and check that all
        # four entry points let it propagate with its own message.
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # (both halves are raw so \( and \) are regex escapes, not
        # invalid string escapes)
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2968
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002969
Georg Brandl02524622010-12-02 18:06:51 +00002970
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Exercises codecs.code_page_encode() / codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is a ValueError, an unknown one an OSError
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each entry of tests is (raw bytes, error handler, expected str);
        # an expected value of None means decoding has to fail.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            consumed = result[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry of tests is (text, error handler, expected bytes);
        # an expected value of None means encoding has to fail.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            payload = result[0]
            self.assertEqual(payload, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, payload, expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        utf8_encode = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_decode(932, cp932_decode)
        self.check_decode(self.CP_UTF8, utf8_decode)
        self.check_encode(self.CP_UTF8, utf8_encode)

    def test_incremental(self):
        # Decode with final=False: each case pairs the input with the
        # (text, consumed byte count) tuple that must come back.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        import _bootlocale
        saved = _bootlocale.getpreferredencoding
        # Pretend that the preferred encoding is the unknown 'cp123'
        _bootlocale.getpreferredencoding = lambda *a: 'cp123'
        try:
            self.assertEqual(codecs.lookup('cp123').name, 'mbcs')
        finally:
            _bootlocale.getpreferredencoding = saved
3142
Victor Stinner3a50e702011-10-18 21:21:00 +02003143
class ASCIITest(unittest.TestCase):
    # Encoding to and decoding from 'ascii', including error handlers.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, bytes the handler must produce)
        cases = [
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace',
             b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ]
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('ascii', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (bytes, error handler, text the handler must produce)
        cases = [
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ]
        for raw, handler, wanted in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=wanted):
                decoded = raw.decode('ascii', handler)
                self.assertEqual(decoded, wanted)
3181
3182
class Latin1Test(unittest.TestCase):
    # Encoding to and decoding from 'latin1', including error handlers.

    def test_encode(self):
        # (text, expected bytes)
        cases = [
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ]
        for text, wanted in cases:
            with self.subTest(data=text, expected=wanted):
                encoded = text.encode('latin1')
                self.assertEqual(encoded, wanted)

    def test_encode_errors(self):
        # (text, error handler, bytes the handler must produce)
        cases = [
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ]
        for text, handler, wanted in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=wanted):
                encoded = text.encode('latin1', handler)
                self.assertEqual(encoded, wanted)

    def test_encode_surrogateescape_error(self):
        # the first character can be decoded, but not the second
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        # (bytes, expected text)
        cases = [
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ]
        for raw, wanted in cases:
            with self.subTest(data=raw, expected=wanted):
                decoded = raw.decode('latin1')
                self.assertEqual(decoded, wanted)
3218
3219
# Run the tests collected by unittest when this file is executed directly.
if __name__ == "__main__":
    unittest.main()