blob: 29788643728c21eb1eed57460083a3c4d8d81c8c [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only when running on Windows with a major version of 6 or higher
# (Vista or later); always False on every other platform.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional: start from the "not available" values and only
# overwrite them when the import succeeds.
ctypes = None
SIZEOF_WCHAR_T = -1
try:
    import ctypes
except ImportError:
    # Keep ctypes = None and SIZEOF_WCHAR_T = -1.
    pass
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    Simple FIFO buffer: write() appends data at one end, read() removes
    data from the other end.
    """
    def __init__(self, buffer):
        # Starting contents of the queue (callers pass b"" for an empty one).
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return the first size items; everything if size < 0."""
        if size < 0:
            # Slicing with [:0] leaves an empty object of the buffer's type.
            chunk, self._buffer = self._buffer, self._buffer[:0]
        else:
            chunk, self._buffer = self._buffer[:size], self._buffer[size:]
        return chunk
43
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    The methods use the assert* methods of unittest.TestCase, so this class
    has to be mixed into a TestCase.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding can be suspended and resumed at any offset.

        For every split point i of the encoded input s, the first part is
        decoded, the decoder state is transplanted into a brand new decoder,
        and the remainder is decoded there.  The concatenated output has to
        equal the expected text u.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for i in range(len(s)+1):
            d = make_decoder()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input must not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = make_decoder()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding can be suspended and resumed at any offset.

        For every split point i of the text u, the first part is encoded,
        the encoder state is transplanted into a brand new encoder, and the
        remainder is encoded there.  The concatenated output has to equal
        the expected encoded form s.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for i in range(len(u)+1):
            d = make_encoder()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = make_encoder()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
76
class ReadTest(MixInCheckStateHandling):
    """Decoding tests shared by the per-codec test cases.

    Concrete subclasses also derive from unittest.TestCase and set the
    ``encoding`` class attribute to the name of the codec under test.
    """

    def check_partial(self, input, partialresults):
        """Decode input byte by byte and compare the accumulated output.

        After the n-th byte of the encoded input has been fed, everything
        decoded so far has to equal partialresults[n-1].  The check is done
        with a StreamReader, with an incremental decoder, with the same
        decoder after reset(), and finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder; the
        # second round reuses the decoder after reset() to check whether
        # the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for reset_first in (False, True):
            if reset_first:
                d.reset()
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """Check StreamReader.readline() for all supported line endings."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() would return an empty
        # list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty: an empty line read with
            # keepends=False would look like EOF to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield the original lines."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """Check readline() when the data arrives piecewise through a queue."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return three consecutive \\r\\n lines intact."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # Nothing is left once the last line has been read.
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """readline() must return five consecutive \\r\\n lines intact."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # Nothing is left once the last line has been read.
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec, which marks its byte order with a BOM."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the bytes actually produced when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must be rejected."""
        for data in (4*b"\xff", 8*b"\xff"):
            s = io.BytesIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """Check the 'replace' and 'ignore' handlers on truncated input."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
387
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # Each character takes four bytes, so a new character only shows up
        # in the output after its fourth byte has been fed.
        expected = (
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + 4 * ["\x00\xff\u0100\uffff"]
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte is an incomplete code unit; with final=True the
        # strict handler has to raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
431
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # Each character takes four bytes, so a new character only shows up
        # in the output after its fourth byte has been fed.
        expected = (
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + 4 * ["\x00\xff\u0100\uffff"]
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte is an incomplete code unit; with final=True the
        # strict handler has to raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
475
476
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec, which marks its byte order with a BOM."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the bytes actually produced when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must be rejected."""
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """Check the 'replace' and 'ignore' handlers on truncated input."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """Truncated final input must raise in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # NOTE(review): the 'U' mode flag is deprecated and later removed in
        # newer Python versions -- confirm against the targeted version.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000556
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # BMP characters take two bytes each; the final non-BMP character
        # takes four and only appears once all four have been fed.
        expected = (
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + 4 * ["\x00\xff\u0100\uffff"]
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for data, replaced in cases:
            # Strict decoding of final input has to fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD for the bad part.
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        text = "\U00010203"
        raw = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
599
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # BMP characters take two bytes each; the final non-BMP character
        # takes four and only appears once all four have been fed.
        expected = (
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + 4 * ["\x00\xff\u0100\uffff"]
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for data, replaced in cases:
            # Strict decoding of final input has to fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD for the bad part.
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        text = "\U00010203"
        raw = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
642
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # The characters take 1, 2, 2, 3, 3 and 4 bytes respectively; each
        # one appears in the output once its last byte has been fed.
        expected = (
            2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 3 * ["\x00\xff\u07ff"]
            + 3 * ["\x00\xff\u07ff\u0800"]
            + 4 * ["\x00\xff\u07ff\u0800\uffff"]
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u07ff\u0800\uffff\U00010000", expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in both directions by default.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # Result of encoding a lone surrogate with each error handler.
        per_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in per_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        # 'surrogatepass' lets lone surrogates through in both directions.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or malformed sequences are still errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000701
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only cp65001 codec.

    In the case tables below an expected value of None means that the
    conversion must raise instead of returning a result.
    """
    encoding = "cp65001"

    def test_encode(self):
        """Encode sample strings and compare them with the expected bytes."""
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The expected treatment of a lone surrogate depends on the
        # Windows version.
        if VISTA_OR_LATER:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            cases += [('\udc80', 'strict', b'\xed\xb2\x80')]

        for text, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            failure = ('%a.encode("cp65001", %r)=%a != %a'
                       % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, failure)

    def test_decode(self):
        """Decode sample byte strings and compare with the expected text."""
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        # The expected treatment of an encoded lone surrogate depends on
        # the Windows version.
        if VISTA_OR_LATER:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            cases += [(b'[\xed\xb2\x80]', 'strict', '[\udc80]')]

        for raw, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            failure = ('%a.decode("cp65001", %r)=%a != %a'
                       % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, failure)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Strict cp65001 rejects lone surrogates; handlers substitute them."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        text = "[\uDC80]"
        expected_by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in expected_by_handler:
            self.assertEqual(text.encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """"surrogatepass" round-trips lone surrogates through cp65001."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
800
801
802
class UTF7Test(ReadTest, unittest.TestCase):
    """Runs the shared ReadTest checks against the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Run check_partial on "a+-b" with its list of partial results."""
        expected_steps = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial("a+-b", expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000817
class UTF16ExTest(unittest.TestCase):
    """Error-path checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """The single byte b"\\xff" in strict mode raises UnicodeDecodeError."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder without arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
825
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of bytes comes back as bytes paired with its length."""
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        """The empty string yields empty bytes and a length of 0."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
841
class UTF8SigTest(ReadTest, unittest.TestCase):
    """Runs the shared ReadTest checks and BOM tests for "utf-8-sig"."""
    encoding = "utf-8-sig"

    # Read sizes used by the stream tests; None means "read() without a size".
    _SIZEHINTS = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]

    def test_partial(self):
        """Run check_partial on a text that starts with U+FEFF."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        """A byte string holding only the BOM decodes to the empty string."""
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """Text encoded with utf-8-sig decodes back to the original text."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        """Read bytestring through a utf-8-sig reader with each size hint.

        For every entry of _SIZEHINTS a fresh reader is drained with
        repeated read() calls and the concatenated chunks must equal
        unistring.
        """
        reader = codecs.getreader("utf-8-sig")
        for sizehint in self._SIZEHINTS:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                # An empty chunk ends the read loop.
                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            # Name the size hint so a failure points at the exact case.
            self.assertEqual(got, unistring, "sizehint=%r" % (sizehint,))

    def test_stream_bom(self):
        """Stream input that starts with a BOM decodes without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        """Stream input without a BOM decodes to the same text."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
925
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        """The empty string yields empty bytes and a length of 0."""
        decoded = codecs.escape_decode("")
        self.assertEqual(decoded, (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000929
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000930class RecodingTest(unittest.TestCase):
931 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000932 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200933 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000934 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000935 f2.close()
936 # Python used to crash on this at exit because of a refcount
937 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000938
# From RFC 3492
# Each entry is a pair (text, punycode): PunycodeTest encodes the str and
# compares it case-insensitively with the bytes, and decodes the bytes and
# compares the result with the str.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese (name continues on the next comment line):
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1042
# Import-time sanity check: print every table entry that is not a pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001046
class PunycodeTest(unittest.TestCase):
    """Checks the "punycode" codec against punycode_testcases."""

    def test_encode(self):
        """Encoding each text gives the expected bytes, ignoring case."""
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(text.encode("punycode"), "ascii")
            reference = str(expected, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        """Decoding each byte string gives back the original text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Repeat with a bytes object rebuilt through an ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001065
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    # Warning filter passed to support.check_warnings() where the codec's
    # deprecation warning is expected.
    _DEPRECATION = ('unicode_internal codec has been deprecated',
                    DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Out-of-range or wrongly sized 4-byte input must be rejected."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on
        # little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(self._DEPRECATION):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError carries encoding, object, start and end."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises stays outermost so the exception passes through
        # check_warnings() exactly as it would with a try/except around it.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(self._DEPRECATION):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error callback is used for undecodable input."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATION):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four undecodable bytes between the encoded "a" and "b".
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report how much input they consumed."""
        with support.check_warnings(self._DEPRECATION):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001141
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is a pair (input, expected) of UTF-8 encoded byte strings, as
# consumed by NameprepTest:
#   * input None     -> the vector is skipped,
#   * expected None  -> nameprep() must raise UnicodeError for the input,
#   * otherwise      -> nameprep(input) must equal expected.
# The list position matters: entry N (counting from 1) is reported as
# "Test 3.N", which is why skipped vectors keep a (None, None) placeholder.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes \xef\xb9\xb6 below are the UTF-8 form of
    # U+FE76, not U+FB38 as the title says -- confirm against the draft.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    # NOTE(review): the line above names the same bytes that are used
    # below; 3.44 cites \xc3\xdf instead -- confirm which one is meant.
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1294
1295
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Check every vector that is not marked as skipped."""
        from encodings.idna import nameprep
        # Count from 1 so the number matches the "3.N" vector labels.
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = raw.decode("utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError):
                    nameprep(text)
                continue
            wanted = expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of the stream must yield an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Punycode label, without and with a trailing dot (mirrors the
        # four cases covered by test_builtin_decode).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once its terminating dot has been seen;
        # the last label is held back until final=True.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() iterates the str, i.e. feeds one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII label, without and with a trailing dot (mirrors the
        # four cases covered by test_builtin_encode).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        # As with decoding, the last label is held back until final=True.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the codecs module API."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # The literal below is U+00FF followed by "ff"; the non-ASCII
        # character is what makes the ascii codec fail.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE whatever the outcome of the test.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1445
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two lines, each a single three-byte UTF-8 character.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001455
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller sees UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001467
# Encodings exercised by the generic round-trip tests in this file.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings excluded from the incremental encoder/decoder checks.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001585
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic round-trip checks run over every name in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the requested name,
            # treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempted from the non-bytes argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1723
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode with str, int->str and int->int mappings.

    Each test covers the "strict", "replace" and "ignore" error handlers
    for bytes that are unmapped or mapped to an undefined value.
    """

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Map string too short for byte 0x02.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # U+FFFE in the map is treated as "undefined".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A byte may map to more than one character.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # A byte may map to the empty string.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point beyond sys.maxunicode is rejected with TypeError.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
1917
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001918
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers must be usable as context managers."""

    def test_encodedfile(self):
        # The underlying file holds UTF-8; reads are transcoded to Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001931
class TypesTest(unittest.TestCase):
    """Which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # An escape above U+10FFFF is an error in strict mode and becomes
        # U+FFFD with the "replace" handler.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
1967
class SurrogateEscapeTest(unittest.TestCase):
    """Round-tripping undecodable bytes with the "surrogateescape" handler.

    Each undecodable byte 0xNN decodes to the lone surrogate U+DCNN, and
    encoding that surrogate yields the original byte again.
    """

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2000
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002001
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every iteration reuses the same scratch file; remove it at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002057
Victor Stinner3fed0872010-05-22 02:16:27 +00002058
# Names of the bytes-to-bytes transform codecs exercised by the tests below.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
# The compression codecs are only added when their backing module can be
# imported; the imports serve purely as availability probes.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
2077
class TransformCodecTest(unittest.TestCase):
    """Exercise every codec listed in bytes_transform_encodings."""

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput))
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o))
            self.assertEqual(i, binput)

    def test_read(self):
        # Reading the encoded form through a StreamReader must give back
        # the original byte.
        for encoding in bytes_transform_encodings:
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # uu_codec and zlib_codec are deliberately left out of the
            # readline() check.
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80")
2105
2106
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode().

    The table-driven tests feed (input, errors, expected) triples to
    check_encode()/check_decode(); an expected value of None means the call
    must raise the corresponding Unicode error.
    """

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must mention the code page by name.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) triple with code page *cp*.

        When *expected* is None the decode call must raise
        UnicodeDecodeError; otherwise the decoded text must equal *expected*
        and the reported consumed length must lie in [0, len(raw)].
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors)

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) triple with code page *cp*.

        When *expected* is None the encode call must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the reported consumed length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            # NOTE(review): 'replace' yields b'y' for U+00FF but b'?' for
            # U+20AC -- presumably a code page best-fit mapping; confirm.
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The CP_UTF8 encode checks only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # The fourth argument (final) is False: a trailing incomplete
        # multibyte sequence is left unconsumed instead of raising.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
2254
2255
# Run the whole test module when executed as a script.
if __name__ == "__main__":
    unittest.main()