blob: 353a8509dd876bf8931c7d5f58e87c85404d54e2 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with major version >= 6 (Vista or newer);
# the version query is short-circuited away on every other platform.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
16
try:
    import ctypes
except ImportError:
    ctypes = None

# Size in bytes of the C wchar_t type, or -1 when ctypes is unavailable.
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build a checker for a stateless codec function.

    The returned ``check(input, expect)`` asserts, through
    ``self.assertEqual``, that ``coder(input)`` equals the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
29
class Queue(object):
    """
    FIFO buffer: data is appended at one end and consumed from the other.

    Works with any sliceable, concatenable buffer type (the tests use
    bytes); the type of the initial buffer is preserved.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the queue; otherwise at most ``size``
        # items are removed from the head and returned.
        if size < 0:
            # slicing with [:0] yields an empty buffer of the same type
            data, self._buffer = self._buffer, self._buffer[:0]
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
49
class MixInCheckStateHandling:
    """Mixin with helpers that verify getstate()/setstate() round trips
    of incremental decoders and encoders at every possible split point.

    The host class must provide the unittest assertion methods.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Decode s[:split] with one decoder, transplant its state into a
        # fresh decoder, finish decoding there and compare with u.
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            pending, flags = state
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                # reset decoder to the default state without anything buffered
                decoder.setstate((pending[:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(pending))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = new_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Encode u[:split] with one encoder, transplant its state into a
        # fresh encoder, finish encoding there and compare with s.
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = new_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
82
class ReadTest(MixInCheckStateHandling):
    """Stream-reader and incremental-decoder tests shared by the codec
    test cases.

    Concrete subclasses set ``encoding`` and, for test_lone_surrogates,
    ``ill_formed_sequence`` (the encoded form of the lone surrogate
    U+DC80).
    """

    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to a decoder one byte at a time
        # and check after every byte that everything decoded so far
        # equals the matching entry of partialresults.
        chunks = [bytes([byte]) for byte in input.encode(self.encoding)]

        # First pass: a StreamReader on top of a byte queue.
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for chunk, expected in zip(chunks, partialresults):
            queue.write(chunk)
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same check, driven through an incremental decoder.
            decoded = ""
            for chunk, expected in zip(chunks, partialresults):
                decoded += decoder.decode(chunk)
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # Second pass: a fresh incremental decoder
        incremental = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(incremental)

        # Third pass: the same decoder after reset() must behave like new
        incremental.reset()
        feed_bytewise(incremental)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # join them with "|" so line boundaries are visible.
            reader = getreader(input)
            next_line = lambda: reader.readline(size=size, keepends=keepends)
            return "|".join(iter(next_line, ""))

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        for args, expected in (
                ((True,), sexpected),
                ((False,), sexpectednoends),
                ((True, 10), sexpected),
                ((False, 10), sexpectednoends)):
            self.assertEqual(readalllines(s, *args), expected)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        bodies = [(200 * (index + 1)) * "\u3042"
                  for index in range(len(lineends))]
        terminated = [body + lineend
                      for body, lineend in zip(bodies, lineends)]
        self.assertEqual(readalllines("".join(terminated), True),
                         "|".join(terminated))
        self.assertEqual(readalllines("".join(terminated), False),
                         "|".join(bodies))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                prefix = size * "a"
                s = 10 * (prefix + lineend + "xxx\n")
                for keepends, first, second in (
                        (True, prefix + lineend, "xxx\n"),
                        (False, prefix, "xxx")):
                    reader = getreader(s)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            first,
                        )
                        self.assertEqual(
                            reader.readline(keepends=keepends),
                            second,
                        )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        remaining_lines = lines[1:]

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.read(), ''.join(remaining_lines))
        self.assertEqual(reader.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.readline(), lines[0])
        self.assertEqual(reader.readlines(), remaining_lines)
        self.assertEqual(reader.read(), '')

        # Test read() followed by read()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.read(), data[5:])
        self.assertEqual(reader.read(), '')

        # Issue #12446: Test read() followed by readlines()
        reader = getreader()
        self.assertEqual(reader.read(size=40, chars=5), data[:5])
        self.assertEqual(reader.readlines(),
                         [lines[0][5:]] + remaining_lines)
        self.assertEqual(reader.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the original
        # \r\n-terminated lines.
        expected_lines = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        payload = "".join(expected_lines).encode(self.encoding)
        reader = codecs.getreader(self.encoding)(io.BytesIO(payload))
        for index, line in enumerate(reader):
            self.assertEqual(line, expected_lines[index])

    def test_readlinequeue(self):
        # Interleave writes and readline() calls on a shared queue.
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        def expect_line(expected, keepends):
            self.assertEqual(reader.readline(keepends=keepends), expected)

        # No lineends
        writer.write("foo\r")
        expect_line("foo", False)
        writer.write("\nbar\r")
        expect_line("", False)
        expect_line("bar", False)
        writer.write("baz")
        expect_line("baz", False)
        expect_line("", False)

        # Lineends
        writer.write("foo\r")
        expect_line("foo\r", True)
        writer.write("\nbar\r")
        expect_line("\n", True)
        expect_line("bar\r", True)
        writer.write("baz")
        expect_line("baz", True)
        expect_line("", True)
        writer.write("foo\r\n")
        expect_line("foo\r\n", True)

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"
        pieces = [s1, s2, s3]

        stream = io.BytesIO("".join(pieces).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # every line in order, then an empty string at end of stream
        for expected in pieces + [""]:
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"
        pieces = [s1, s2, s3, s4, s5]

        stream = io.BytesIO("".join(pieces).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # every line in order, then an empty string at end of stream
        for expected in pieces + [""]:
            self.assertEqual(reader.readline(), expected)

    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        encoding = self.encoding
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode(encoding)
        # What each error handler makes of a lone surrogate when encoding
        for handler, rendered in (
                ("backslashreplace", "[\\udc80]"),
                ("namereplace", "[\\udc80]"),
                ("xmlcharrefreplace", "[&#56448;]"),
                ("ignore", "[]"),
                ("replace", "[?]")):
            self.assertEqual("[\uDC80]".encode(encoding, handler),
                             rendered.encode(encoding))

        # Encoding the empty string yields just the BOM (if there is one)
        bom = "".encode(encoding)
        skip = len(bom)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(encoding)[skip:]
            after_sequence = after.encode(encoding)[skip:]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            with self.assertRaises(UnicodeDecodeError):
                test_sequence.decode(encoding)
            self.assertEqual(test_string.encode(encoding, "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(encoding, "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            # backslashreplace renders each undecodable byte as \xNN
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200385
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"
    # Encoded form of the lone surrogate U+DC80 in native byte order
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" with a single leading BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        reader, writer = info[2], info[3]
        # encode some stream
        sink = io.BytesIO()
        out = writer(sink)
        for _ in range(2):
            out.write("spam")
        d = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d in (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(reader(io.BytesIO(d)).read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two invalid code units may be accepted as a BOM
        for repeat in (4, 8):
            stream = io.BytesIO(repeat * b"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        self.check_partial(
            text,
            [""] * 4            # BOM: byteorder known after the fourth byte
            + [""] * 3          # first three bytes of the first character
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text[:4]] * 4
            + [text]
        )

    def test_handlers(self):
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        for bom, unit in ((b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),
                          (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00')):
            encoded = bom + unit * 1024
            self.assertEqual('\U00010000' * 1024,
                             codecs.utf_32_decode(encoded)[0])
480
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"
    # Encoded form of the lone surrogate U+DC80
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        self.check_partial(
            text,
            [""] * 3            # first three bytes of the first character
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text[:4]] * 4
            + [text]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = b'\x00\x00\x01\x00' * count
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * count, decoded)
525
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"
    # Encoded form of the lone surrogate U+DC80
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        self.check_partial(
            text,
            [""] * 3            # first three bytes of the first character
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text[:4]] * 4
            + [text]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = b'\x00\x01\x00\x00' * count
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * count, decoded)
570
571
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-16" codec."""

    encoding = "utf-16"
    # Encoded form of the lone surrogate U+DC80 in native byte order
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" with a single leading BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        reader, writer = info[2], info[3]
        # encode some stream
        sink = io.BytesIO()
        out = writer(sink)
        for _ in range(2):
            out.write("spam")
        d = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d in (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(reader(io.BytesIO(d)).read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two invalid code units may be accepted as a BOM
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        self.check_partial(
            text,
            [""] * 2            # BOM: byteorder known after the second byte
            + [""]              # first byte of the first character
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text[:4]] * 4    # stays put until the surrogate pair is complete
            + [text]
        )

    def test_handlers(self):
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # opening with the 'U' mode is expected to emit a DeprecationWarning
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000657
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""

    encoding = "utf-16-le"
    # Encoded form of the lone surrogate U+DC80
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        self.check_partial(
            text,
            [""]                # first byte of the first character
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text[:4]] * 4    # stays put until the surrogate pair is complete
            + [text]
        )

    def test_errors(self):
        # (malformed input, result of decoding it with 'replace')
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair
        char, pair = "\U00010203", b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
701
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec, on top of the shared ReadTest checks."""
    encoding = "utf-16-be"
    # Bytes 0xDC 0x80 form the single big-endian code unit 0xDC80,
    # i.e. a lone low surrogate.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial() comes from ReadTest (not shown in this section);
        # presumably each list entry is the text decoded so far after one
        # more input byte has been fed -- confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (ill-formed input, text expected when decoding it
        # with errors='replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining vectors running after a failure
            # and names the offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 lies outside the BMP and must round-trip through a
        # surrogate pair (D800 DE03, high byte first).
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
745
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec, on top of the shared ReadTest checks."""
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # check_partial() comes from ReadTest (not shown in this section);
        # presumably each list entry is the text decoded so far after one
        # more input byte has been fed -- confirm against ReadTest.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        decoded_so_far = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, decoded_so_far)

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, data in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), data)
            self.assertEqual(data.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # An incomplete surrogate sequence must still be rejected, whether
        # it is cut off by the end of input or by an unrelated byte.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000799
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec."""
    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means encoding must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The expected handling of a lone surrogate depends on the
        # Windows version.
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest lets every vector run and identifies the failing one.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # Each entry is (bytes, error handler, expected text); an expected
        # value of None means decoding must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            # subTest lets every vector run and identifies the failing one.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # ... while each error handler substitutes its own replacement.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
901
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200902
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec, on top of the shared ReadTest checks."""
    encoding = "utf-7"

    def test_partial(self):
        # check_partial() comes from ReadTest (not shown in this section);
        # presumably each list entry is the text decoded so far after one
        # more input byte has been fed -- confirm against ReadTest.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        decoded_so_far = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, decoded_so_far)

    def test_errors(self):
        # (ill-formed input, text expected when decoding with 'replace')
        cases = (
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        )
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to
        # the same bytes; decoding yields the single non-BMP character.
        encoded = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(self.encoding), encoded)
        self.assertEqual('\ud801\udca0'.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Overrides the inherited attribute of the same name with a
    # non-callable, so no lone-surrogate test runs for this codec.
    test_lone_surrogates = None
976
977
class UTF16ExTest(unittest.TestCase):
    """Error and argument checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte passed with the last positional argument set to
        # True must be rejected under the 'strict' error handler.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling with no arguments at all is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
985
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is returned as the equivalent bytes
        # object together with the number of bytes consumed.
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are TypeErrors.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1001
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec, reusing everything from UTF8Test."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # check_partial() is inherited (not shown in this section);
        # presumably each list entry is the text decoded so far after one
        # more input byte has been fed -- confirm against its definition.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM that the encoder
        # prepends, giving back the original text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sig_stream(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader with a range
        # of read() size hints (None means "no hint") and check that the
        # concatenated chunks equal *unistring* every time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest reports which size hint failed and keeps going.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not show up in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)
1085
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by b'0', must pass
        # through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline disappears; quotes and backslash are unescaped.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        # Single-letter control escapes.
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hexadecimal escapes (only a lowercase 'x' introduces one).
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Every byte that does not start a recognised escape must be kept,
        # together with its backslash.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            byte = bytes([code])
            check(b'\\' + byte, b'\\' + byte)

    def test_errors(self):
        decode = codecs.escape_decode
        # A \x escape with zero or one hex digit is invalid, both on its
        # own and inside brackets; 'ignore' drops it and 'replace' puts
        # b'?' in its place.
        for escape in (br"\x", br"\x0"):
            bracketed = b"[" + escape + b"]"
            combined = bracketed + escape
            self.assertRaises(ValueError, decode, escape)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(combined, "ignore"),
                             (b"[]", len(combined)))
            self.assertEqual(decode(combined, "replace"),
                             (b"[?]?", len(combined)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001137
class RecodingTest(unittest.TestCase):
    """Regression test for closing a codecs.EncodedFile wrapper."""

    def test_recoding(self):
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also have closed the wrapped stream.
        self.assertTrue(raw.closed)
1148
# Punycode sample strings from RFC 3492.
# Every entry is a 2-tuple: (decoded text, Punycode-encoded bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (
        "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
        "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
        b"egbpdaj6bu4bxfgehfvwxn",
    ),
    # (B) Chinese (simplified):
    (
        "\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
        b"ihqwcrb4cv8a8dqg056pqjye",
    ),
    # (C) Chinese (traditional):
    (
        "\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
        b"ihqwctvzc91f659drss3x8bo0yb",
    ),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (
        "\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
        "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
        "\u0065\u0073\u006B\u0079",
        b"Proprostnemluvesky-uyb24dma41a",
    ),
    # (E) Hebrew:
    (
        "\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
        "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
        "\u05D1\u05E8\u05D9\u05EA",
        b"4dbcagdahymbxekheh6e0a7fei0b",
    ),
    # (F) Hindi (Devanagari):
    (
        "\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
        "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
        "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
        "\u0939\u0948\u0902",
        b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd",
    ),
    # (G) Japanese (kanji and hiragana):
    (
        "\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
        "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
        b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa",
    ),
    # (H) Korean (Hangul syllables):
    (
        "\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
        "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
        "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
        b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
        b"psd879ccm6fea98c",
    ),
    # (I) Russian (Cyrillic):
    (
        "\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
        "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
        "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
        "\u0438",
        b"b1abfaaepdrnnbgefbaDotcwatmq2g4l",
    ),
    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (
        "\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
        "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
        "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
        "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
        "\u0061\u00F1\u006F\u006C",
        b"PorqunopuedensimplementehablarenEspaol-fmd56a",
    ),
    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (
        "\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
        "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
        "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
        "\u0056\u0069\u1EC7\u0074",
        b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g",
    ),
    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (
        "\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
        b"3B-ww4c5e180e575a65lsy2b",
    ),
    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (
        "\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
        "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
        "\u004F\u004E\u004B\u0045\u0059\u0053",
        b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n",
    ),
    # (N) Hello-Another-Way-<sorezore><no><basho>
    (
        "\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
        "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
        "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
        b"Hello-Another-Way--fc4qua05auwb3674vfr0b",
    ),
    # (O) <hitotsu><yane><no><shita>2
    (
        "\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
        b"2-u9tlzr9756bt3uc0v",
    ),
    # (P) Maji<de>Koi<suru>5<byou><mae>
    (
        "\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
        "\u308B\u0035\u79D2\u524D",
        b"MajiKoi5-783gue6qz075azm5e",
    ),
    # (Q) <pafii>de<runba>
    (
        "\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
        b"de-jg4avhby1noc0d",
    ),
    # (R) <sono><supiido><de>
    (
        "\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
        b"d9juau41awczczp",
    ),
    # (S) -> $1.00 <-
    (
        "\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
        "\u003C\u002D",
        b"-> $1.00 <--",
    ),
]

# Import-time sanity check of the table above: print any entry that is
# not a 2-tuple (e.g. because of a missing comma between two literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256
class PunycodeTest(unittest.TestCase):
    """Runs the module-level punycode_testcases vectors through the codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest reports which vector failed and lets the rest run.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode a second time from a bytes object rebuilt through
                # an ASCII round trip; a separate name is used so the loop
                # variable is not rebound.
                rebuilt = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001275
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001276class UnicodeInternalTest(unittest.TestCase):
Victor Stinner182d90d2011-09-29 19:53:55 +02001277 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001278 def test_bug1251300(self):
1279 # Decoding with unicode_internal used to not correctly handle "code
1280 # points" above 0x10ffff on UCS-4 builds.
Victor Stinner182d90d2011-09-29 19:53:55 +02001281 ok = [
1282 (b"\x00\x10\xff\xff", "\U0010ffff"),
1283 (b"\x00\x00\x01\x01", "\U00000101"),
1284 (b"", ""),
1285 ]
1286 not_ok = [
1287 b"\x7f\xff\xff\xff",
1288 b"\x80\x00\x00\x00",
1289 b"\x81\x00\x00\x00",
1290 b"\x00",
1291 b"\x00\x00\x00\x00\x00",
1292 ]
1293 for internal, uni in ok:
1294 if sys.byteorder == "little":
1295 internal = bytes(reversed(internal))
Ezio Melotti11060a42011-11-16 09:39:10 +02001296 with support.check_warnings():
1297 self.assertEqual(uni, internal.decode("unicode_internal"))
Victor Stinner182d90d2011-09-29 19:53:55 +02001298 for internal in not_ok:
1299 if sys.byteorder == "little":
1300 internal = bytes(reversed(internal))
Ezio Melotti345379a2011-11-16 09:54:19 +02001301 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001302 'deprecated', DeprecationWarning)):
1303 self.assertRaises(UnicodeDecodeError, internal.decode,
1304 "unicode_internal")
Victor Stinnere3b47152011-12-09 20:49:49 +01001305 if sys.byteorder == "little":
1306 invalid = b"\x00\x00\x11\x00"
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001307 invalid_backslashreplace = r"\x00\x00\x11\x00"
Victor Stinnere3b47152011-12-09 20:49:49 +01001308 else:
1309 invalid = b"\x00\x11\x00\x00"
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001310 invalid_backslashreplace = r"\x00\x11\x00\x00"
Victor Stinnere3b47152011-12-09 20:49:49 +01001311 with support.check_warnings():
1312 self.assertRaises(UnicodeDecodeError,
1313 invalid.decode, "unicode_internal")
1314 with support.check_warnings():
1315 self.assertEqual(invalid.decode("unicode_internal", "replace"),
1316 '\ufffd')
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001317 with support.check_warnings():
1318 self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
1319 invalid_backslashreplace)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001320
Victor Stinner182d90d2011-09-29 19:53:55 +02001321 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001322 def test_decode_error_attributes(self):
Victor Stinner182d90d2011-09-29 19:53:55 +02001323 try:
Ezio Melotti345379a2011-11-16 09:54:19 +02001324 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001325 'deprecated', DeprecationWarning)):
1326 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Victor Stinner182d90d2011-09-29 19:53:55 +02001327 except UnicodeDecodeError as ex:
1328 self.assertEqual("unicode_internal", ex.encoding)
1329 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1330 self.assertEqual(4, ex.start)
1331 self.assertEqual(8, ex.end)
1332 else:
1333 self.fail()
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001334
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_decode_callback(self):
    # Register codecs.ignore_errors under a private name and check that
    # the unicode_internal decoder honours it: four 0x22 bytes spliced
    # into the encoded form of "ab" are dropped, yet all 12 input bytes
    # are reported as consumed.
    codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
    decode = codecs.getdecoder("unicode_internal")
    with support.check_warnings(('unicode_internal codec has been '
                                 'deprecated', DeprecationWarning)):
        raw = "ab".encode("unicode_internal").decode()
        head, tail = raw[:4], raw[4:]
        corrupted = bytes("%s\x22\x22\x22\x22%s" % (head, tail), "ascii")
        outcome = decode(corrupted, "UnicodeInternalTest")
    self.assertEqual(("ab", 12), outcome)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001346
def test_encode_length(self):
    # Check the second item of the tuples returned by the
    # unicode_internal encoder and by codecs.escape_encode.
    expected_warning = ('unicode_internal codec has been '
                        'deprecated', DeprecationWarning)
    with support.check_warnings(expected_warning):
        # Issue 3739
        encode = codecs.getencoder("unicode_internal")
        for text, reported in (("a", 1), ("\xe9\u0142", 2)):
            self.assertEqual(encode(text)[1], reported)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001356
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair of UTF-8 encoded byte strings (input, expected):
#   * expected is None when the input must be rejected (the consumer of
#     this table asserts that nameprep raises UnicodeError),
#   * (None, None) marks a vector that is skipped; it is kept as a
#     placeholder so that list position N still corresponds to vector
#     3.(N+1).
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes \xef\xb9\xb6 below are the UTF-8 form of
    # U+FE76, not U+FB38 as the vector title says -- confirm against the
    # source document before "fixing" either side.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1509
1510
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        # Every vector runs in its own subTest, so a failure names the
        # vector ("3.N") and does not prevent the remaining vectors from
        # being checked.  This covers both the "must be rejected" and the
        # "must map to" branches.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # List position N holds vector 3.(N+1).
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                text = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, text)
                else:
                    expected = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(text), expected)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001529
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        # One-shot decoding, with and without a trailing dot.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # One-shot encoding, with and without a trailing dot.
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, read() returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII name without a trailing dot (this slot used to repeat
        # the trailing-dot assertion below verbatim).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot ending it has been seen;
        # the last label is emitted on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without a trailing dot (this slot used to repeat
        # the trailing-dot assertion below verbatim).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # Same buffering rule as for decoding: output appears per
        # completed label, the remainder on the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        # "strict" must work in both directions ...
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        # ... and every other handler must be refused.
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1615
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001616class CodecsModuleTest(unittest.TestCase):
1617
1618 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001619 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1620 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001621 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001622 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001623 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001624
Victor Stinnera57dfd02014-05-14 17:13:14 +02001625 # test keywords
1626 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1627 '\xe4\xf6\xfc')
1628 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1629 '[]')
1630
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001631 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001632 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1633 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001634 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001635 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001636 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001637 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001638
Victor Stinnera57dfd02014-05-14 17:13:14 +02001639 # test keywords
1640 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1641 b'\xe4\xf6\xfc')
1642 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1643 b'[]')
1644
Walter Dörwald063e1e82004-10-28 13:04:26 +00001645 def test_register(self):
1646 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001647 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001648
1649 def test_lookup(self):
1650 self.assertRaises(TypeError, codecs.lookup)
1651 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001652 self.assertRaises(LookupError, codecs.lookup, " ")
1653
1654 def test_getencoder(self):
1655 self.assertRaises(TypeError, codecs.getencoder)
1656 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1657
1658 def test_getdecoder(self):
1659 self.assertRaises(TypeError, codecs.getdecoder)
1660 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1661
1662 def test_getreader(self):
1663 self.assertRaises(TypeError, codecs.getreader)
1664 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1665
1666 def test_getwriter(self):
1667 self.assertRaises(TypeError, codecs.getwriter)
1668 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001669
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001670 def test_lookup_issue1813(self):
1671 # Issue #1813: under Turkish locales, lookup of some codecs failed
1672 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001673 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001674 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1675 try:
1676 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1677 except locale.Error:
1678 # Unsupported locale on this system
1679 self.skipTest('test needs Turkish locale')
1680 c = codecs.lookup('ASCII')
1681 self.assertEqual(c.name, 'ascii')
1682
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001683 def test_all(self):
1684 api = (
1685 "encode", "decode",
1686 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1687 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1688 "getencoder", "getdecoder", "getincrementalencoder",
1689 "getincrementaldecoder", "getreader", "getwriter",
1690 "register_error", "lookup_error",
1691 "strict_errors", "replace_errors", "ignore_errors",
1692 "xmlcharrefreplace_errors", "backslashreplace_errors",
1693 "namereplace_errors",
1694 "open", "EncodedFile",
1695 "iterencode", "iterdecode",
1696 "BOM", "BOM_BE", "BOM_LE",
1697 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1698 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1699 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1700 "StreamReaderWriter", "StreamRecoder",
1701 )
1702 self.assertCountEqual(api, codecs.__all__)
1703 for api in codecs.__all__:
1704 getattr(codecs, api)
1705
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001706 def test_open(self):
1707 self.addCleanup(support.unlink, support.TESTFN)
1708 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1709 with self.subTest(mode), \
1710 codecs.open(support.TESTFN, mode, 'ascii') as file:
1711 self.assertIsInstance(file, codecs.StreamReaderWriter)
1712
1713 def test_undefined(self):
1714 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1715 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1716 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1717 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1718 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1719 self.assertRaises(UnicodeError,
1720 codecs.encode, 'abc', 'undefined', errors)
1721 self.assertRaises(UnicodeError,
1722 codecs.decode, b'abc', 'undefined', errors)
1723
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001733
class EncodedFileTest(unittest.TestCase):
    """Check codecs.EncodedFile transcoding on read and on write."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        transcoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(transcoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed in end up as Latin-1 in the file.
        sink = io.BytesIO()
        transcoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        transcoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001745
# Codec names that the generic round-trip tests iterate over.  Entries use
# the underscore spelling; "mbcs" is appended below only when this build's
# codecs module provides mbcs_encode.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# Only add "mbcs" when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (the tests skip their stream reader/writer and incremental
# encoder/decoder checks for these names).
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal"
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001861
Walter Dörwald3abcb012007-04-16 22:10:50 +00001862class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
    # Round-trip a short ASCII string through every codec listed in
    # all_unicode_encodings, using in turn: the stateless
    # encoder/decoder, the stream writer/reader, the incremental
    # encoder/decoder, and iterencode()/iterdecode().
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        # The name reported by lookup() must match the list entry,
        # ignoring the "_" vs "-" spelling.
        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

        with support.check_warnings():
            # unicode-internal has been deprecated
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

        if encoding not in broken_unicode_with_stateful:
            # check stream reader/writer
            # Write one character at a time and collect what reaches
            # the queue after each write.
            q = Queue(b"")
            writer = codecs.getwriter(encoding)(q)
            encodedresult = b""
            for c in s:
                writer.write(c)
                chunk = q.read()
                self.assertTrue(type(chunk) is bytes, type(chunk))
                encodedresult += chunk
            # Feed the encoded bytes back one byte at a time.
            q = Queue(b"")
            reader = codecs.getreader(encoding)(q)
            decodedresult = ""
            for c in encodedresult:
                q.write(bytes([c]))
                decodedresult += reader.read()
            self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = b""
                for c in s:
                    encodedresult += encoder.encode(c)
                # final=True flushes whatever the encoder still buffers
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += decoder.decode(bytes([c]))
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

                # check iterencode()/iterdecode()
                result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "encoding=%r" % encoding)

                # check iterencode()/iterdecode() with empty string
                result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                self.assertEqual(result, "")

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = "".join(decoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
@support.cpython_only
def test_basics_capi(self):
    # Incremental round-trip of a short ASCII string for every codec in
    # all_unicode_encodings, with the incremental encoder/decoder
    # objects obtained through the _testcapi helpers instead of the
    # codecs module functions.
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                # final=True flushes whatever the encoder still buffers
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001980
Walter Dörwald729c31f2005-03-14 19:06:30 +00001981 def test_seek(self):
1982 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001983 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001984 for encoding in all_unicode_encodings:
1985 if encoding == "idna": # FIXME: See SF bug #1163178
1986 continue
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001987 if encoding in broken_unicode_with_stateful:
Walter Dörwald729c31f2005-03-14 19:06:30 +00001988 continue
Victor Stinner05010702011-05-27 16:50:40 +02001989 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001990 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001991 # Test that calling seek resets the internal codec state and buffers
1992 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001993 data = reader.read()
1994 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001995
Walter Dörwalde22d3392005-11-17 08:52:34 +00001996 def test_bad_decode_args(self):
1997 for encoding in all_unicode_encodings:
1998 decoder = codecs.getdecoder(encoding)
1999 self.assertRaises(TypeError, decoder)
2000 if encoding not in ("idna", "punycode"):
2001 self.assertRaises(TypeError, decoder, 42)
2002
2003 def test_bad_encode_args(self):
2004 for encoding in all_unicode_encodings:
2005 encoder = codecs.getencoder(encoding)
Ezio Melottiadc417c2011-11-17 12:23:34 +02002006 with support.check_warnings():
2007 # unicode-internal has been deprecated
2008 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002009
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002010 def test_encoding_map_type_initialized(self):
2011 from encodings import cp1140
2012 # This used to crash, we are only verifying there's no crash.
2013 table_type = type(cp1140.encoding_table)
2014 self.assertEqual(table_type, table_type)
2015
Walter Dörwald3abcb012007-04-16 22:10:50 +00002016 def test_decoder_state(self):
2017 # Check that getstate() and setstate() handle the state properly
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002018 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00002019 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002020 if encoding not in broken_unicode_with_stateful:
Walter Dörwald3abcb012007-04-16 22:10:50 +00002021 self.check_state_handling_decode(encoding, u, u.encode(encoding))
2022 self.check_state_handling_encode(encoding, u, u.encode(encoding))
2023
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with str, int->str and int->int maps.

    Every test decodes the three bytes 0, 1, 2.  "Incomplete" maps give
    byte 2 no usable entry (missing, None, or U+FFFE), which must raise
    under "strict" and be handled by the other error handlers.
    """

    # (error handler, expected text) when byte 2 cannot be mapped.
    _HANDLED = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Maps covering all three bytes decode to the map itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Too short, or byte 2 mapped to U+FFFE.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)

        for errors, expected in self._HANDLED:
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in complete:
            self.assertEqual(decode(data, "strict", mapping), (expected, 3))

        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},  # Issue #14850
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, expected in self._HANDLED:
            for mapping in incomplete:
                self.assertEqual(decode(data, errors, mapping), (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        complete = (
            ({0: a, 1: b, 2: c}, "abc"),
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),  # Issue #15379
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for mapping, expected in complete:
            self.assertEqual(decode(data, "strict", mapping), (expected, 3))

        # A code point beyond the Unicode range is a type error, not a
        # decoding error.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in incomplete:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", mapping)

        for errors, expected in self._HANDLED:
            for mapping in incomplete:
                self.assertEqual(decode(data, errors, mapping), (expected, 3))
2258
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002259
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            payload = recoded.read()
            self.assertEqual(payload, b"\xfc")
        # Leaving the with block must have closed the wrapped stream.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002273
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7",
            "utf_8",
            "utf_16_le",
            "utf_16_be",
            "utf_16_ex",
            "utf_32",
            "utf_32_le",
            "utf_32_be",
            "utf_32_ex",
            "latin_1",
            "ascii",
            "charmap",
        )
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        # mbcs_decode is only tested when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape beyond the Unicode range is an error, subject to the
        # usual error handlers.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002313
Serhiy Storchakad6793772013-01-29 10:20:44 +02002314
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def _checker(self, coder):
        """Return check(data, expected) asserting coder consumes all of data."""
        def check(data, expected):
            self.assertEqual(coder(data), (expected, len(data)))
        return check

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = self._checker(codecs.unicode_escape_encode)
        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named:
            check(char, escaped)
        # Remaining control characters and everything from DEL up to
        # U+00FF use the \xNN form.
        for code in [*range(32), *range(127, 256)]:
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = self._checker(codecs.unicode_escape_decode)
        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash before any byte that does not start a known escape
        # is kept literally.
        escape_starters = frozenset(b'\n"\'\\abtnvfr01234567xuUN')
        for code in range(256):
            if code in escape_starters:
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for letter, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with too few hex digits.
                truncated = b"\\" + letter + b"0"*count
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2391
2392
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and ..._decode()."""

    def _checker(self, coder):
        """Return check(data, expected) asserting coder consumes all of data."""
        def check(data, expected):
            self.assertEqual(coder(data), (expected, len(data)))
        return check

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = self._checker(codecs.raw_unicode_escape_encode)
        # Only \u and \U are escapes for this codec; any other
        # backslash pair passes through unchanged.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = self._checker(codecs.raw_unicode_escape_decode)
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for letter, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with too few hex digits.
                truncated = b"\\" + letter + b"0"*count
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2441
2442
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler with several codecs."""

    def _roundtrip(self, encoding, raw, text):
        """Assert raw decodes to text, and text encodes back to raw."""
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2475
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002476
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek()."""

    def test_seek0(self):
        text = "1234567890"
        bom_encodings = ("utf-16",
                         "utf-16-le",
                         "utf-16-be",
                         "utf-32",
                         "utf-32-le",
                         "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Each scenario starts from a freshly truncated file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in bom_encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002532
Victor Stinner3fed0872010-05-22 02:16:27 +00002533
# Bytes-to-bytes codecs that are always available; the optional
# compression codecs are appended below when their module imports.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alternative names under which each transform codec can be looked up.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name defined so tests can be skipped on "zlib is None".
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002563
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes transform codecs and their error reporting.

    The codecs under test come from the module-level
    bytes_transform_encodings list; alias lookups use transform_aliases.
    """

    def test_basics(self):
        """Round-trip all 256 byte values through each transform codec."""
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        """A StreamReader for each codec decodes the stream with read()."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        """A StreamReader for each codec decodes the stream with readline()."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Both fragments are raw strings: the pattern contains
                # regex escapes (\( and \)) that are not valid string
                # escape sequences.
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper exception has the same type as the error it wraps.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The wrapper exception has the same type as the error it wraps.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2687
Nick Coghlan8b097b42013-11-13 23:49:21 +10002688
2689# The codec system tries to wrap exceptions in order to ensure the error
2690# mentions the operation being performed and the codec involved. We
2691# currently *only* want this to happen for relatively stateless
2692# exceptions, where the only significant information they contain is their
2693# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002694
2695# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Maps codec names to the CodecInfo objects served by _get_test_codec().
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Return the test codec stored under *codec_name*, or None if absent."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)
2702
try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # The import failed, so provide a do-nothing stand-in with the same
    # call signature; callers can then invoke it unconditionally.
    def _forget_codec(codec_name):
        pass
2709
2710
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec errors get wrapped with operation/codec details.

    Each test installs a throwaway codec under a per-instance name and then
    drives it through both the text model methods (str.encode and
    bytes.decode) and the codecs.encode/codecs.decode functions.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # Nothing was cached under this name, so there is nothing to
            # forget.
            pass

    def set_codec(self, encode, decode):
        """Install *encode* and *decode* as this test's codec."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert that the managed code raises a wrapped *exc_type*.

        The raised exception must be an *exc_type* whose message matches
        "<operation> with '<codec name>' codec failed (<type name>: <msg>)"
        and whose __cause__ is an *exc_type* instance carrying a traceback.
        Note that *msg* is interpolated into a regular expression.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check that *obj_to_raise* is wrapped on every codec entry point.

        *msg* is the message expected from the original exception and
        *exc_type* the type expected for both the wrapper and its cause.
        """
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check that *obj_to_raise* propagates with its own message.

        Every codec entry point must raise a RuntimeError whose message
        matches the regular expression *msg*.
        """
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # Both halves of each pattern are raw strings so that "\(" and "\)"
        # are regex escapes rather than invalid string escape sequences.
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2870
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002871
Georg Brandl02524622010-12-02 18:06:51 +00002872
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The code page name must appear in the error message.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each case in *tests* with code page *cp* and verify it.

        *tests* is an iterable of (raw, errors, expected) triples.  When
        *expected* is None the decode must raise UnicodeDecodeError;
        otherwise the decoded text must equal *expected* and the reported
        number of consumed bytes must lie within 0..len(raw).
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Encode each case in *tests* with code page *cp* and verify it.

        *tests* is an iterable of (text, errors, expected) triples.  When
        *expected* is None the encode must raise UnicodeEncodeError;
        otherwise the encoded bytes must equal *expected* and the reported
        number of consumed characters must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            # backslashreplace escapes the undecodable byte itself, which
            # is 0x81 in this input.
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # With final=False, a trailing incomplete multibyte sequence is
        # left unconsumed instead of being reported as an error.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
3031
3032
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()