blob: fb3db77a92e2e35a01f5205a288adffa627513c3 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major version of 6 or higher;
# the version query is never attempted on other platforms.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
16
# ctypes is optional: without it the width of wchar_t cannot be queried,
# which is signalled by ctypes being None and a size of -1.
try:
    import ctypes
except ImportError:
    ctypes, SIZEOF_WCHAR_T = None, -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build a ``check(input, expect)`` function bound to test case *self*.

    The returned function calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result is the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
29
class Queue(object):
    """
    Minimal FIFO stream: data written at one end is read from the other.

    The buffer type is whatever was passed to the constructor; only
    slicing and ``+=`` are used on it.
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most
        # *size* items are taken from the front.
        pending = self._buffer
        if size < 0:
            taken, remaining = pending, pending[:0]
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
49
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for incremental codecs.

    The host class must provide the unittest assertion methods.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder state transfer at every split point of *s*.

        For each prefix of the encoded bytes *s*, the prefix is decoded,
        the decoder state is copied into a brand-new decoder, and the
        remainder is decoded there; the concatenation must equal *u*.
        """
        new_decoder = codecs.getincrementaldecoder(encoding)
        for cut in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:cut])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # put the decoder into the default state with an empty buffer
                decoder.setstate((state[0][:0], 0))
                # Re-feeding the buffered input must not yield any output
                self.assertFalse(decoder.decode(state[0]))
                # ... and must bring the decoder back to the captured state
                self.assertEqual(state, decoder.getstate())
            # Continue in a fresh decoder primed with the captured state
            decoder = new_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[cut:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder state transfer at every split point of *u*.

        For each prefix of the text *u*, the prefix is encoded, the
        encoder state is copied into a brand-new encoder, and the
        remainder is encoded there; the concatenation must equal *s*.
        """
        new_encoder = codecs.getincrementalencoder(encoding)
        for cut in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:cut])
            state = encoder.getstate()
            encoder = new_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[cut:], True)
            self.assertEqual(s, head + tail)
82
class ReadTest(MixInCheckStateHandling):
    """Reader and decoder tests shared by the per-encoding test classes.

    This class reads ``self.encoding`` throughout and, in
    test_lone_surrogates(), ``self.ill_formed_sequence``; both are
    expected to be supplied by the concrete subclass.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* byte by byte and compare the running results.

        *input* is encoded with ``self.encoding`` and fed one byte at a
        time; after each byte the text decoded so far must equal the
        corresponding entry of *partialresults*.  The check is done with
        a StreamReader, with an incremental decoder (before and after
        reset()), and finally with codecs.iterdecode().
        """
        # Encode once; every sub-check below replays the same bytes.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def check_incremental(decoder):
            # Feed the encoded bytes one at a time into *decoder*.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        check_incremental(d)

        # Check whether the reset method works properly
        d.reset()
        check_incremental(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until the reader returns an empty string and
            # join them with "|" so line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Compare the complete list: checking only the lines that were
        # actually yielded would let a reader that stops early pass.
        self.assertEqual(list(reader), s)

    def test_readlinequeue(self):
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with
    # the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate: strict fails, the other handlers
        # substitute their respective replacement text.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever encoding the empty string produces is stripped from
        # the pieces below and prepended exactly once.
        bom = "".encode(self.encoding)
        # Does not depend on the loop variables, so build it once.
        backslashreplace = ''.join('\\x%02x' % b
                                   for b in self.ill_formed_sequence)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200385
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (BOM-prefixed, either byte order)."""

    encoding = "utf-32"
    # U+DC80 as a single UTF-32 code unit in the machine's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes if the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single stray byte with final=True: "replace" yields U+FFFD,
        # "ignore" yields nothing; both report one byte consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
480
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Twenty expected results, one per fed byte: a character shows
        # up only once the fourth byte of its group has been fed.
        expected = [text[:fed // 4] for fed in range(1, 21)]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        payload = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(payload)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
525
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Twenty expected results, one per fed byte: a character shows
        # up only once the fourth byte of its group has been fed.
        expected = [text[:fed // 4] for fed in range(1, 21)]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        payload = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(payload)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
570
571
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (BOM-prefixed, either byte order)."""

    encoding = "utf-16"
    # U+DC80 as a single UTF-16 code unit in the machine's byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes if the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single stray byte with final=True: "replace" yields U+FFFD,
        # "ignore" yields nothing; both report one byte consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000657
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One expected result per fed byte (twelve in total).
        expected = (
            [""]
            + 2 * [text[:1]]
            + 2 * [text[:2]]
            + 2 * [text[:3]]
            + 4 * [text[:4]]
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result expected with the "replace" handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # U+10203 must round-trip through this surrogate pair.
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
701
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec ("utf-16-be")."""

    encoding = "utf-16-be"
    # Bytes b"\xdc\x80" are the big-endian form of the lone low
    # surrogate U+DC80.
    # NOTE(review): presumably consumed by ReadTest.test_lone_surrogates.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # NOTE(review): check_partial is inherited from ReadTest; the list
        # appears to be the cumulative decoder output after each input byte
        # (12 bytes in total for this string) -- confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected text with 'replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining vectors running after a failure
            # and names the offending input in the report.
            with self.subTest(raw=raw):
                # final=True: truncated input must be reported as an error
                # rather than kept as pending state.
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
745
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"
    # b"\xed\xb2\x80" is what U+DC80 would look like if surrogates were
    # encodable; with 'replace' each of the three bytes is replaced.
    # NOTE(review): presumably consumed by ReadTest.test_lone_surrogates.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # NOTE(review): check_partial is inherited from ReadTest; the list
        # appears to be the cumulative decoder output after each input byte
        # (15 bytes in total for this string) -- confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Characters chosen at the boundaries of each UTF-8 sequence length.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
                         b'[\x80]')

    def test_surrogatepass_handler(self):
        # 'surrogatepass' must let lone surrogates round-trip in both
        # directions, alone and next to a non-BMP character.
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(
            b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
            "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or corrupted surrogate sequences are still errors.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000799
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec.

    Several expectations depend on the module-level VISTA_OR_LATER flag.
    """

    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest keeps the remaining rows running after a failure.
            with self.subTest(text=text, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)
                    continue
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                                 '%a.encode("cp65001", %r)=%a != %a'
                                 % (text, errors, encoded, expected))

    def test_decode(self):
        # Each entry is (bytes, error handler, expected text); an expected
        # value of None means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            # subTest keeps the remaining rows running after a failure.
            with self.subTest(raw=raw, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)
                    continue
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                                 '%a.decode("cp65001", %r)=%a != %a'
                                 % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        # ... while each error handler produces its own substitution.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # 'surrogatepass' must let lone surrogates round-trip in both
        # directions, alone and next to a non-BMP character.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(
            b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
            "abc\ud800def")
        self.assertEqual(
            "\U00010fff\uD800".encode("cp65001", "surrogatepass"),
            b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001",
                                                   "surrogatepass"),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
901
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200902
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        # NOTE(review): check_partial is inherited from ReadTest; the list
        # appears to be the cumulative decoder output after each input byte
        # -- confirm against ReadTest.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected text with 'replace').
        tests = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                # final=True: truncated input must be reported as an error
                # rather than kept as pending state.
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to the
        # same bytes, which decode back to the single non-BMP character.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')

    # Setting the attribute to None hides the test of the same name that
    # UTF8Test, a ReadTest subclass in this file, extends via super(), so
    # it is not run for UTF-7.
    test_lone_surrogates = None
976
977
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        # A single byte with final=True is a truncated code unit.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
985
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # Any object exposing the buffer interface is accepted; the result
        # is (bytes copy, number of bytes consumed).
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str is accepted as well and yields empty bytes.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument, and an object without a buffer interface.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1001
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # NOTE(review): check_partial is inherited via UTF8Test; the list
        # appears to be the cumulative decoder output after each input byte
        # -- confirm against ReadTest.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM the encoder added.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a range
        # of read sizes (None means "read everything") and check that the
        # reassembled text equals *unistring* every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must be dropped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1085
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode() (bytes escape processing)."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            b = bytes([value])
            if b != b'\\':
                with self.subTest(byte=b):
                    self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # check(input, expect) asserts decode(input) == (expect, len(input)).
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Any byte that does not start a recognized escape is kept together
        # with its backslash.
        for value in range(256):
            if value not in b'\n"\'\\abtnvfr01234567x':
                b = bytes([value])
                with self.subTest(byte=b):
                    check(b'\\' + b, b'\\' + b)

    def test_errors(self):
        # Truncated \x escapes: an error in strict mode, dropped with
        # 'ignore', and turned into '?' with 'replace'.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001138
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also close the underlying stream.
        self.assertTrue(f.closed)
1149
# From RFC 3492
# Each entry is a (unicode text, expected punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: a missing comma between two entries would
# silently merge them, so report any entry that is not a 2-tuple.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257
class PunycodeTest(unittest.TestCase):
    """Checks the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest reports the failing vector and keeps checking the
            # remaining ones.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode once more from a bytes object rebuilt through an
                # ASCII round trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001276
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001277class UnicodeInternalTest(unittest.TestCase):
Victor Stinner182d90d2011-09-29 19:53:55 +02001278 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001279 def test_bug1251300(self):
1280 # Decoding with unicode_internal used to not correctly handle "code
1281 # points" above 0x10ffff on UCS-4 builds.
Victor Stinner182d90d2011-09-29 19:53:55 +02001282 ok = [
1283 (b"\x00\x10\xff\xff", "\U0010ffff"),
1284 (b"\x00\x00\x01\x01", "\U00000101"),
1285 (b"", ""),
1286 ]
1287 not_ok = [
1288 b"\x7f\xff\xff\xff",
1289 b"\x80\x00\x00\x00",
1290 b"\x81\x00\x00\x00",
1291 b"\x00",
1292 b"\x00\x00\x00\x00\x00",
1293 ]
1294 for internal, uni in ok:
1295 if sys.byteorder == "little":
1296 internal = bytes(reversed(internal))
Ezio Melotti11060a42011-11-16 09:39:10 +02001297 with support.check_warnings():
1298 self.assertEqual(uni, internal.decode("unicode_internal"))
Victor Stinner182d90d2011-09-29 19:53:55 +02001299 for internal in not_ok:
1300 if sys.byteorder == "little":
1301 internal = bytes(reversed(internal))
Ezio Melotti345379a2011-11-16 09:54:19 +02001302 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001303 'deprecated', DeprecationWarning)):
1304 self.assertRaises(UnicodeDecodeError, internal.decode,
1305 "unicode_internal")
Victor Stinnere3b47152011-12-09 20:49:49 +01001306 if sys.byteorder == "little":
1307 invalid = b"\x00\x00\x11\x00"
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001308 invalid_backslashreplace = r"\x00\x00\x11\x00"
Victor Stinnere3b47152011-12-09 20:49:49 +01001309 else:
1310 invalid = b"\x00\x11\x00\x00"
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001311 invalid_backslashreplace = r"\x00\x11\x00\x00"
Victor Stinnere3b47152011-12-09 20:49:49 +01001312 with support.check_warnings():
1313 self.assertRaises(UnicodeDecodeError,
1314 invalid.decode, "unicode_internal")
1315 with support.check_warnings():
1316 self.assertEqual(invalid.decode("unicode_internal", "replace"),
1317 '\ufffd')
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001318 with support.check_warnings():
1319 self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
1320 invalid_backslashreplace)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001321
Victor Stinner182d90d2011-09-29 19:53:55 +02001322 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001323 def test_decode_error_attributes(self):
Victor Stinner182d90d2011-09-29 19:53:55 +02001324 try:
Ezio Melotti345379a2011-11-16 09:54:19 +02001325 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001326 'deprecated', DeprecationWarning)):
1327 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Victor Stinner182d90d2011-09-29 19:53:55 +02001328 except UnicodeDecodeError as ex:
1329 self.assertEqual("unicode_internal", ex.encoding)
1330 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1331 self.assertEqual(4, ex.start)
1332 self.assertEqual(8, ex.end)
1333 else:
1334 self.fail()
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001335
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_decode_callback(self):
    """Decode with a custom error handler registered under a new name.

    ``codecs.ignore_errors`` is registered as "UnicodeInternalTest" and
    used to decode the internal form of "ab" with four 0x22 bytes spliced
    in after the first four bytes.  The expected result is the text "ab"
    with all 12 input bytes reported as consumed.
    """
    codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
    decode = codecs.getdecoder("unicode_internal")
    deprecation = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)
    with support.check_warnings(deprecation):
        raw = "ab".encode("unicode_internal").decode()
        # Splice the four junk bytes between the two encoded characters.
        corrupted = bytes("%s\x22\x22\x22\x22%s" % (raw[:4], raw[4:]),
                          "ascii")
        result = decode(corrupted, "UnicodeInternalTest")
    self.assertEqual(("ab", 12), result)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001347
def test_encode_length(self):
    """Check the length item (index 1) of the tuples returned by encoders.

    The unicode_internal encoder must report the number of characters it
    was given, and ``codecs.escape_encode`` must report 4 for the
    four-byte raw input ``\\x00``.
    """
    deprecation = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)
    with support.check_warnings(deprecation):
        # Issue 3739
        encode = codecs.getencoder("unicode_internal")
        for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
            self.assertEqual(encode(text)[1], consumed)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001357
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None means nameprep must reject the input, and an
# entry of (None, None) is a skipped vector that is kept only so that the
# position in the list still matches the "3.N" numbering of the draft.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # This vector used to be skipped because it failed in UCS-2 mode;
    # str now always holds whole code points, so U+1D7BB is seen as a
    # single character and the vector can be checked like the others.
    (b'\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
     b'telc\xe2\x88\x95kg\xcf\x83'),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1510
1511
class NameprepTest(unittest.TestCase):
    """Run the ``nameprep_tests`` vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        """Check every vector, reporting each one under its "3.N" number.

        Entries whose input is None are skipped.  An expected value of
        None means nameprep() must raise UnicodeError; otherwise the
        prepared string must equal the expected one.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # A sub-test per vector keeps the real assertion failure (with
            # its traceback), labels both branches with the vector number
            # and lets the remaining vectors run after a failure.
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001530
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through the one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        """Decode ASCII and ACE names, with and without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """Encode ASCII and non-ASCII names, with and without a trailing dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """A second read() after the data was consumed returns ''."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte-by-byte via iterdecode() and an incremental decoder."""
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # ACE name without the trailing dot (this assertion used to be an
        # exact copy of the next one, leaving this case untested).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that ends it has been seen;
        # the last label comes out on the final call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode char-by-char via iterencode() and an incremental encoder."""
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without the trailing dot (this assertion used to
        # be an exact copy of the next one, leaving this case untested).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1616
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode(): positional and keyword use, plus error cases."""
        latin1_bytes = b'\xe4\xf6\xfc'
        latin1_text = '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(latin1_bytes, 'latin-1'), latin1_text)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=latin1_bytes, encoding='latin-1'),
                         latin1_text)
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        """codecs.encode(): positional and keyword use, plus error cases."""
        latin1_text = '\xe4\xf6\xfc'
        latin1_bytes = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(latin1_text, 'latin-1'), latin1_bytes)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # Note: '\xffff' is three characters ('\xff', 'f', 'f').
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj=latin1_text, encoding='latin-1'),
                         latin1_bytes)
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing, unknown or blank name."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """Codec lookup must keep working under a Turkish LC_CTYPE."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        self.assertEqual(codecs.lookup('ASCII').name, 'ascii')

    def test_all(self):
        """codecs.__all__ lists exactly the expected names, all resolvable."""
        expected_api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected_api, codecs.__all__)
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        """codecs.open() returns a StreamReaderWriter for every mode tried."""
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec fails for any input and error handler."""
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1724
class StreamReaderTest(unittest.TestCase):
    """Check line splitting by a UTF-8 StreamReader."""

    def setUp(self):
        """Prepare a two-line UTF-8 byte stream and the reader factory."""
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() returns the decoded lines, keeping the line end."""
        decoded_lines = self.reader(self.stream).readlines()
        self.assertEqual(decoded_lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001734
class EncodedFileTest(unittest.TestCase):
    """Check transcoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        """Read UTF-8 file content as UTF-16-LE, then write UTF-8 data
        into a Latin-1 file."""
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001746
# Text encodings exercised by BasicUnicodeTest, in alphabetical order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001862
Walter Dörwald3abcb012007-04-16 22:10:50 +00001863class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
    """Round-trip a short ASCII sample through every listed encoding.

    For each name in ``all_unicode_encodings`` this checks the looked-up
    codec name, the stateless encoder/decoder pair and, unless the name
    is in ``broken_unicode_with_stateful``, the stream writer/reader, the
    incremental encoder/decoder and iterencode()/iterdecode().
    """
    text = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        label = "encoding=%r" % encoding

        # The registered codec name must match the listed name, modulo
        # "_" versus "-".
        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

        with support.check_warnings():
            # unicode-internal has been deprecated
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text), label)
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, label)

        # Everything below feeds the codec piecewise.
        if encoding in broken_unicode_with_stateful:
            continue

        # check stream reader/writer
        sink = Queue(b"")
        writer = codecs.getwriter(encoding)(sink)
        pieces = []
        for char in text:
            writer.write(char)
            chunk = sink.read()
            self.assertTrue(type(chunk) is bytes, type(chunk))
            pieces.append(chunk)
        encodedresult = b"".join(pieces)
        source = Queue(b"")
        reader = codecs.getreader(encoding)(source)
        decodedresult = ""
        for byte in encodedresult:
            source.write(bytes([byte]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, text, label)

        # check incremental decoder/encoder and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check incremental decoder/encoder
            encodedresult = b"".join(encoder.encode(char) for char in text)
            encodedresult += encoder.encode("", True)
            decoder = codecs.getincrementaldecoder(encoding)()
            decodedresult = "".join(decoder.decode(bytes([byte]))
                                    for byte in encodedresult)
            decodedresult += decoder.decode(b"", True)
            self.assertEqual(decodedresult, text, label)

            # check iterencode()/iterdecode()
            encoded_iter = codecs.iterencode(text, encoding)
            result = "".join(codecs.iterdecode(encoded_iter, encoding))
            self.assertEqual(result, text, label)

            # check iterencode()/iterdecode() with empty string
            encoded_iter = codecs.iterencode("", encoding)
            result = "".join(codecs.iterdecode(encoded_iter, encoding))
            self.assertEqual(result, "")

        if encoding in ("idna", "mbcs"):
            continue

        # check incremental decoder/encoder with errors argument
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            encodedresult = b"".join(encoder.encode(char) for char in text)
            decoder = codecs.getincrementaldecoder(encoding)("ignore")
            decodedresult = "".join(decoder.decode(bytes([byte]))
                                    for byte in encodedresult)
            self.assertEqual(decodedresult, text, label)
@support.cpython_only
def test_basics_capi(self):
    """Round-trip a short ASCII sample through incremental codecs that
    are obtained via the ``_testcapi`` helpers.

    Encodings listed in ``broken_unicode_with_stateful`` are skipped; the
    variant with an explicit "ignore" error handler is additionally
    skipped for "idna" and "mbcs".
    """
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    text = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        label = "encoding=%r" % encoding

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API
            encodedresult = b"".join(cencoder.encode(char) for char in text)
            encodedresult += cencoder.encode("", True)
            cdecoder = codec_incrementaldecoder(encoding)
            decodedresult = "".join(cdecoder.decode(bytes([byte]))
                                    for byte in encodedresult)
            decodedresult += cdecoder.decode(b"", True)
            self.assertEqual(decodedresult, text, label)

        if encoding in ("idna", "mbcs"):
            continue

        # check incremental decoder/encoder with errors argument
        try:
            cencoder = codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            encodedresult = b"".join(cencoder.encode(char) for char in text)
            cdecoder = codec_incrementaldecoder(encoding, "ignore")
            decodedresult = "".join(cdecoder.decode(bytes([byte]))
                                    for byte in encodedresult)
            self.assertEqual(decodedresult, text, label)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001981
Walter Dörwald729c31f2005-03-14 19:06:30 +00001982 def test_seek(self):
1983 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001984 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001985 for encoding in all_unicode_encodings:
1986 if encoding == "idna": # FIXME: See SF bug #1163178
1987 continue
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001988 if encoding in broken_unicode_with_stateful:
Walter Dörwald729c31f2005-03-14 19:06:30 +00001989 continue
Victor Stinner05010702011-05-27 16:50:40 +02001990 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001991 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001992 # Test that calling seek resets the internal codec state and buffers
1993 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001994 data = reader.read()
1995 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001996
Walter Dörwalde22d3392005-11-17 08:52:34 +00001997 def test_bad_decode_args(self):
1998 for encoding in all_unicode_encodings:
1999 decoder = codecs.getdecoder(encoding)
2000 self.assertRaises(TypeError, decoder)
2001 if encoding not in ("idna", "punycode"):
2002 self.assertRaises(TypeError, decoder, 42)
2003
2004 def test_bad_encode_args(self):
2005 for encoding in all_unicode_encodings:
2006 encoder = codecs.getencoder(encoding)
Ezio Melottiadc417c2011-11-17 12:23:34 +02002007 with support.check_warnings():
2008 # unicode-internal has been deprecated
2009 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002010
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002011 def test_encoding_map_type_initialized(self):
2012 from encodings import cp1140
2013 # This used to crash, we are only verifying there's no crash.
2014 table_type = type(cp1140.encoding_table)
2015 self.assertEqual(table_type, table_type)
2016
Walter Dörwald3abcb012007-04-16 22:10:50 +00002017 def test_decoder_state(self):
2018 # Check that getstate() and setstate() handle the state properly
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002019 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00002020 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002021 if encoding not in broken_unicode_with_stateful:
Walter Dörwald3abcb012007-04-16 22:10:50 +00002022 self.check_state_handling_decode(encoding, u, u.encode(encoding))
2023 self.check_state_handling_encode(encoding, u, u.encode(encoding))
2024
class CharmapTest(unittest.TestCase):
    """Checks for codecs.charmap_decode() with each kind of mapping.

    Almost every case decodes the same three bytes (0, 1, 2); only the
    mapping and the error handler vary.  The "gap" mappings used below
    all make byte 2 undecodable, either by omitting it or by mapping it
    to None or U+FFFE.
    """

    _INPUT = b"\x00\x01\x02"

    # (error handler, expected text) when byte 2 cannot be decoded.
    _LENIENT_OUTCOMES = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _assert_decodes(self, errors, mapping, expected):
        # Decode the shared input and compare text and consumed length.
        self.assertEqual(
            codecs.charmap_decode(self._INPUT, errors, mapping),
            (expected, 3)
        )

    def _assert_strict_failure(self, mapping, exc_type=UnicodeDecodeError):
        # Strict decoding of the shared input must raise exc_type.
        self.assertRaises(exc_type,
            codecs.charmap_decode, self._INPUT, "strict", mapping
        )

    def _assert_lenient(self, mappings):
        # Each lenient handler must give its fixed outcome for each mapping.
        for errors, expected in self._LENIENT_OUTCOMES:
            for mapping in mappings:
                self._assert_decodes(errors, mapping, expected)

    def _assert_everything_ignored(self, empty_mapping):
        # With an empty mapping and "ignore", all 256 bytes are dropped.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_mapping),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        for table in ("abc", "\U0010FFFFbc"):
            self._assert_decodes("strict", table, table)

        gaps = ("ab", "ab\ufffe")
        for table in gaps:
            self._assert_strict_failure(table)
        self._assert_lenient(gaps)

        self._assert_everything_ignored("")

    def test_decode_with_int2str_map(self):
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in complete:
            self._assert_decodes("strict", mapping, expected)

        # The '\ufffe' entry is the Issue #14850 case.
        gaps = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in gaps:
            self._assert_strict_failure(mapping)
        self._assert_lenient(gaps)

        self._assert_everything_ignored({})

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')

        complete = (
            ({0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for mapping, expected in complete:
            self._assert_decodes("strict", mapping, expected)

        # A code point beyond sys.maxunicode is rejected as a TypeError.
        self._assert_strict_failure({0: sys.maxunicode + 1, 1: b, 2: c},
                                    TypeError)

        gaps = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in gaps:
            self._assert_strict_failure(mapping)
        self._assert_lenient(gaps)
2259
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002260
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # UTF-8 bytes in the underlying file are read back as Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        wrapped = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with wrapped as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the block must have closed the underlying stream.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapped = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapped as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002274
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only.append(codecs.mbcs_decode)
        for decode in bytes_only:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # \U00110000 is rejected in strict mode; the lenient handlers
        # substitute for the whole 10-character escape.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002314
Serhiy Storchakad6793772013-01-29 10:20:44 +02002315
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode_escape encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash encode to themselves.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        for char, escaped in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(char, escaped)
        # Remaining control characters and 127..255 use the \xNN form.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for escaped, expected in recognised:
            check(escaped, expected)
        # A backslash before any other byte is kept literally.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # (escape letter, number of too-short digit counts to try)
        for marker, digits in ((b'x', 2), (b'u', 4), (b'U', 4)):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_large = br"\U00110000"
        with self.assertRaises(UnicodeDecodeError):
            decode(too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2392
2393
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the raw_unicode_escape encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but u/U passes through as is.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but u/U passes through as is.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # (escape letter, number of too-short digit counts to try)
        for marker, digits in ((b'u', 4), (b'U', 4)):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_large = br"\U00110000"
        with self.assertRaises(UnicodeDecodeError):
            decode(too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2442
2443
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding raw must give text, and encoding text must give raw back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2476
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002477
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files across seeks."""

    def test_seek0(self):
        text = "1234567890"
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Each scenario starts from a freshly truncated scratch file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as fobj:
                fobj.write(text)
                fobj.write(text)
                for _ in range(2):
                    fobj.seek(0)
                    self.assertEqual(fobj.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as fobj:
                fobj.write(text[0])
                self.assertNotEqual(fobj.tell(), 0)
                fobj.seek(0)
                fobj.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as fobj:
                writer = fobj.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as fobj:
                fobj.write(text)
                fobj.seek(fobj.tell())
                fobj.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as fobj:
                writer = fobj.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                fobj.seek(0)
                self.assertEqual(fobj.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002533
Victor Stinner3fed0872010-05-22 02:16:27 +00002534
# Names of the bytes-to-bytes transform codecs exercised by the tests below.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alias names that must resolve to each transform codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# The compression codecs are only covered when their modules can be imported.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002564
2565class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002566
Georg Brandl02524622010-12-02 18:06:51 +00002567 def test_basics(self):
2568 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002569 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002570 with self.subTest(encoding=encoding):
2571 # generic codecs interface
2572 (o, size) = codecs.getencoder(encoding)(binput)
2573 self.assertEqual(size, len(binput))
2574 (i, size) = codecs.getdecoder(encoding)(o)
2575 self.assertEqual(size, len(o))
2576 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002577
Georg Brandl02524622010-12-02 18:06:51 +00002578 def test_read(self):
2579 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002580 with self.subTest(encoding=encoding):
2581 sin = codecs.encode(b"\x80", encoding)
2582 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2583 sout = reader.read()
2584 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002585
2586 def test_readline(self):
2587 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002588 with self.subTest(encoding=encoding):
2589 sin = codecs.encode(b"\x80", encoding)
2590 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2591 sout = reader.readline()
2592 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002593
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002594 def test_buffer_api_usage(self):
2595 # We check all the transform codecs accept memoryview input
2596 # for encoding and decoding
2597 # and also that they roundtrip correctly
2598 original = b"12345\x80"
2599 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002600 with self.subTest(encoding=encoding):
2601 data = original
2602 view = memoryview(data)
2603 data = codecs.encode(data, encoding)
2604 view_encoded = codecs.encode(view, encoding)
2605 self.assertEqual(view_encoded, data)
2606 view = memoryview(data)
2607 data = codecs.decode(data, encoding)
2608 self.assertEqual(data, original)
2609 view_decoded = codecs.decode(view, encoding)
2610 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002611
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002612 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002613 # Check binary -> binary codecs give a good error for str input
2614 bad_input = "bad input type"
2615 for encoding in bytes_transform_encodings:
2616 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002617 fmt = ( "{!r} is not a text encoding; "
2618 "use codecs.encode\(\) to handle arbitrary codecs")
2619 msg = fmt.format(encoding)
2620 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002621 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002622 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002623
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002624 def test_text_to_binary_blacklists_text_transforms(self):
2625 # Check str.encode gives a good error message for str -> str codecs
2626 msg = (r"^'rot_13' is not a text encoding; "
2627 "use codecs.encode\(\) to handle arbitrary codecs")
2628 with self.assertRaisesRegex(LookupError, msg):
2629 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002630
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002631 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002632 # Check bytes.decode and bytearray.decode give a good error
2633 # message for binary -> binary codecs
2634 data = b"encode first to ensure we meet any format restrictions"
2635 for encoding in bytes_transform_encodings:
2636 with self.subTest(encoding=encoding):
2637 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002638 fmt = (r"{!r} is not a text encoding; "
2639 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002640 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002641 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002642 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002643 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002644 bytearray(encoded_data).decode(encoding)
2645
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002646 def test_binary_to_text_blacklists_text_transforms(self):
2647 # Check str -> str codec gives a good error for binary input
2648 for bad_input in (b"immutable", bytearray(b"mutable")):
2649 with self.subTest(bad_input=bad_input):
2650 msg = (r"^'rot_13' is not a text encoding; "
2651 "use codecs.decode\(\) to handle arbitrary codecs")
2652 with self.assertRaisesRegex(LookupError, msg) as failure:
2653 bad_input.decode("rot_13")
2654 self.assertIsNone(failure.exception.__cause__)
2655
@unittest.skipUnless(zlib, "Requires zlib support")
def test_custom_zlib_error_is_wrapped(self):
    """Malformed input to the zlib codec must produce a descriptive error.

    The raised exception has to mention the decoding operation and the
    codec, and its ``__cause__`` must be an instance of the same type as
    the raised exception itself.
    """
    expected_pattern = "^decoding with 'zlib_codec' codec failed"
    with self.assertRaisesRegex(Exception, expected_pattern) as caught:
        codecs.decode(b"hello", "zlib_codec")
    wrapper = caught.exception
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2664
def test_custom_hex_error_is_wrapped(self):
    """Malformed input to the hex codec must produce a descriptive error.

    The raised exception has to mention the decoding operation and the
    codec, and its ``__cause__`` must be an instance of the same type as
    the raised exception itself.
    """
    expected_pattern = "^decoding with 'hex_codec' codec failed"
    with self.assertRaisesRegex(Exception, expected_pattern) as caught:
        codecs.decode(b"hello", "hex_codec")
    wrapper = caught.exception
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2672
2673 # Unfortunately, the bz2 module throws OSError, which the codec
2674 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002675
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002676 # Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    """Each alias in ``transform_aliases`` resolves to its canonical codec.

    The canonical codec is looked up once per entry; every alias is then
    looked up in its own subTest and must report the same ``name``.
    """
    for canonical, alias_names in transform_aliases.items():
        canonical_info = codecs.lookup(canonical)
        expected_name = canonical_info.name
        for alias in alias_names:
            with self.subTest(alias=alias):
                alias_info = codecs.lookup(alias)
                self.assertEqual(alias_info.name, expected_name)
2684
def test_uu_invalid(self):
    """Decoding uu data that lacks a "begin" line must raise ValueError."""
    with self.assertRaises(ValueError):
        codecs.decode(b"", "uu-codec")
2688
Nick Coghlan8b097b42013-11-13 23:49:21 +10002689
2690# The codec system tries to wrap exceptions in order to ensure the error
2691# mentions the operation being performed and the codec involved. We
2692# currently *only* want this to happen for relatively stateless
2693# exceptions, where the only significant information they contain is their
2694# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002695
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
2698_TEST_CODECS = {}
2699
2700def _get_test_codec(codec_name):
2701 return _TEST_CODECS.get(codec_name)
2702codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2703
Nick Coghlan8fad1672014-09-15 23:50:44 +12002704try:
2705 # Issue #22166: Also need to clear the internal cache in CPython
2706 from _codecs import _forget_codec
2707except ImportError:
2708 def _forget_codec(codec_name):
2709 pass
2710
2711
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec exceptions get wrapped by the codec machinery.

    Each test installs a throwaway codec (through the module-level
    ``_TEST_CODECS`` registry) whose encode/decode callables raise a chosen
    exception, then checks whether the error seen by the caller is a
    wrapper that names the operation and the codec, or the original
    exception unchanged.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # Deliberately ignored: there is nothing to forget when the
            # codec was never looked up during the test.
            pass

    def set_codec(self, encode, decode):
        """Publish *encode*/*decode* as the codec named ``self.codec_name``."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Context manager asserting that the body raises a wrapped error.

        The raised exception must be an *exc_type* whose message matches
        "<operation> with '<codec>' codec failed (<type name>: <msg>)".
        Note that *msg* is interpolated into the regular expression
        without escaping.  After the body has run, ``__cause__`` must be
        an *exc_type* that carries a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        # These checks have to run after the "with" block has captured
        # the exception.
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Assert *obj_to_raise* is wrapped on all four entry points.

        The entry points are str.encode, codecs.encode, bytes.decode and
        codecs.decode.
        """
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Assert *obj_to_raise* reaches the caller with its own message.

        All four entry points (str.encode, codecs.encode, bytes.decode,
        codecs.decode) must raise a RuntimeError matching the regular
        expression *msg*.
        """
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # Every fragment of these patterns is a raw string: "\(" inside a
        # non-raw literal is an invalid escape sequence.
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2871
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002872
Georg Brandl02524622010-12-02 18:06:51 +00002873
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Exercise codecs.code_page_encode() and codecs.code_page_decode().

    The cases are table driven: each row is (input, error handler,
    expected output), where an expected output of None means that the
    call has to raise a Unicode error.
    """

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The code page name has to show up in the error message.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode every row of *tests* with code page *cp* (final=True)."""
        for raw, errors, expected in tests:
            if expected is None:
                # Rows without an expected value must fail to decode.
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode,
                                  cp, raw, errors, True)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            failure_note = ('%a.decode("cp%s", %r)=%a != %a'
                            % (raw, cp, errors, decoded[0], expected))
            self.assertEqual(decoded[0], expected, failure_note)
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(decoded[1], 0)
            self.assertLessEqual(decoded[1], len(raw))

    def check_encode(self, cp, tests):
        """Encode every row of *tests* with code page *cp*."""
        for text, errors, expected in tests:
            if expected is None:
                # Rows without an expected value must fail to encode.
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode,
                                  cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            failure_note = ('%a.encode("cp%s", %r)=%a != %a'
                            % (text, cp, errors, encoded[0], expected))
            self.assertEqual(encoded[0], expected, failure_note)
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each row: (input bytes, expected (text, consumed) result) when
        # decoding with code page 932 and final=False.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
3032
3033
# Run the whole test module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()