blob: 67690b855f585e74b61e2f57fece9625efa8203f [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only on Windows whose reported major version is 6 or higher;
# always False on every other platform (getwindowsversion() is never
# called there thanks to the short-circuiting "and").
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional.  When it cannot be imported, ``ctypes`` is bound to
# None and SIZEOF_WCHAR_T to -1 so that later code can test for both.
try:
    import ctypes
except ImportError:
    ctypes, SIZEOF_WCHAR_T = None, -1
else:
    # Size in bytes of the platform's C wchar_t type.
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    FIFO buffer: data written at one end is read back, in order, from
    the other end.  The buffer keeps the type of the object passed to
    the constructor (the tests in this file use bytes).
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most
        # ``size`` items are removed from the head and returned.
        data = self._buffer
        if size < 0:
            head, tail = data, data[:0]  # data[:0] is an empty buffer of the same type
        else:
            head, tail = data[:size], data[size:]
        self._buffer = tail
        return head
43
class MixInCheckStateHandling:
    # Helpers that check getstate()/setstate() of incremental codecs.
    # The class using this mix-in must supply the assert* methods
    # (assertIsInstance, assertFalse, assertEqual) that are called here.

    def check_state_handling_decode(self, encoding, u, s):
        """Check that a decoder's state can be saved and restored.

        For every split point of the encoded input *s*, the first part
        is decoded, the decoder state is moved into a brand new decoder
        and the rest is decoded there; both parts together must equal
        the text *u*.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = make_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # put the decoder back into its default state with an
                # empty buffer of the same type as the buffered input
                decoder.setstate((state[0][:0], 0))
                # Re-feeding the buffered input must not yield any output
                self.assertFalse(decoder.decode(state[0]))
                # ... and must bring the decoder back to the same state
                self.assertEqual(state, decoder.getstate())
            # Continue with a fresh decoder that only knows the saved state
            resumed = make_decoder()
            resumed.setstate(state)
            tail = resumed.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that an encoder's state can be saved and restored.

        For every split point of the text *u*, the first part is
        encoded, the encoder state is moved into a brand new encoder
        and the rest is encoded there; both parts together must equal
        the encoded data *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = make_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            resumed = make_encoder()
            resumed.setstate(state)
            tail = resumed.encode(u[split:], True)
            self.assertEqual(s, head + tail)
76
class ReadTest(MixInCheckStateHandling):
    # Reader tests shared by the codec test cases below.  A concrete test
    # case lists this class next to unittest.TestCase and sets the
    # ``encoding`` class attribute that every method here reads.

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* byte by byte and check the output.

        *partialresults* holds, for each encoded byte, the text that must
        have been decoded once that byte has been fed.  The check is done
        with a StreamReader, with an incremental decoder (before and after
        reset()) and finally with codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder; the
        # second pass reuses the same decoder after reset() to check that
        # the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for after_reset in (False, True):
            if after_reset:
                d.reset()
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line and join them with "|" so that the line
            # boundaries show up in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: each of them is
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and a loop over it silently checks nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; readalllines() stops at
            # the first empty result.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # the input alternates with "xxx\n" lines; consume them
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one Queue, so the reader only
        # ever sees what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # nothing is left after the last line
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # nothing is left after the last line
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-32" codec, whose output starts with a byte order mark.
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write two chunks through one StreamWriter
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        data = raw.getvalue()
        # the result must contain exactly one BOM, in either byte order
        self.assertIn(data, (self.spamle, self.spambe))
        # the data must read back as the concatenated text
        back = info.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # neither input starts with a valid BOM
        for bogus in (4*b"\xff", 8*b"\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(bogus))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Nothing is expected for the four BOM bytes (after which the
        # byte order is known) nor for the first three bytes of the first
        # character; every character then shows up with its fourth byte.
        expected = [""] * 7
        for count in range(1, 5):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, and counted as consumed
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for data in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding, "spamspam", data)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
387
class UTF32LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-le" codec.
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Four bytes per character: nothing is expected for the first
        # three bytes, every character shows up with its fourth byte.
        expected = [""] * 3
        for count in range(1, 5):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
431
class UTF32BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-32-be" codec.
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Four bytes per character: nothing is expected for the first
        # three bytes, every character shows up with its fourth byte.
        expected = [""] * 3
        for count in range(1, 5):
            expected += [text[:count]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
475
476
class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-16" codec, whose output starts with a byte order mark.
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # neither input starts with a valid BOM
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, and counted as consumed
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Use 'r' instead of the removed 'U' mode (open() rejects 'U' since
        # Python 3.11); the mode still carries no binary flag, which is the
        # case this test is about.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000556
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-le" codec.
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Two bytes for each of the first four characters, four bytes
        # (a surrogate pair) for the last one; a character shows up only
        # with its final byte.
        expected = [""]
        for count in range(1, 4):
            expected += [text[:count]] * 2
        expected += [text[:4]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        # (invalid input, result of decoding it with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for data, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(data, 'strict', True)
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # a non-BMP character is stored as a surrogate pair
        text = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), text)
599
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the "utf-16-be" codec.
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Two bytes for each of the first four characters, four bytes
        # (a surrogate pair) for the last one; a character shows up only
        # with its final byte.
        expected = [""]
        for count in range(1, 4):
            expected += [text[:count]] * 2
        expected += [text[:4]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        # (invalid input, result of decoding it with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for data, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(data, 'strict', True)
            self.assertEqual(data.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        # a non-BMP character is stored as a surrogate pair
        text = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), text)
642
class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-8" codec.
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # (length of the decoded prefix, number of consecutive bytes after
        # which exactly that prefix is expected)
        steps = ((1, 2), (2, 2), (3, 3), (4, 3), (5, 4), (6, 1))
        expected = []
        for prefix_len, repeat in steps:
            expected.extend([text[:prefix_len]] * repeat)
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # strict mode rejects lone surrogates in both directions
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # what each error handler produces when encoding "[\uDC80]"
        per_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, encoded in per_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), encoded)

    def test_surrogatepass_handler(self):
        # with "surrogatepass", surrogates survive a round trip
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # truncated or malformed sequences are still errors
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000701
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Runs the inherited ReadTest checks plus the tests below against the
    # "cp65001" codec; the whole class is skipped unless on win32.
    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes).  An expected value
        # of None means the call has to raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            # Here a lone surrogate is expected to encode even in strict mode.
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            mismatch = ('%a.encode("cp65001", %r)=%a != %a'
                        % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, mismatch)

    def test_decode(self):
        # Rows are (raw bytes, error handler, expected text).  An expected
        # value of None means the call has to raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            # Here an encoded lone surrogate is expected to decode in strict
            # mode.
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            mismatch = ('%a.decode("cp65001", %r)=%a != %a'
                        % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, mismatch)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # In strict mode a lone surrogate is refused in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        # Each remaining error handler has its own rendering of U+DC80.
        handler_output = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, wanted in handler_output:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), wanted)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # With "surrogatepass", lone surrogates survive a round trip.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
800
801
802
class UTF7Test(ReadTest, unittest.TestCase):
    # Runs the inherited ReadTest checks against the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        # Expected partial results that check_partial compares against for
        # the input "a+-b".
        source = "a+-b"
        expected_steps = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(source, expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000817
class UTF16ExTest(unittest.TestCase):
    # Error and argument handling of codecs.utf_16_ex_decode().

    def test_errors(self):
        # A lone byte decoded in strict mode is a decoding error.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder with no arguments is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
825
class ReadBufferTest(unittest.TestCase):
    # Checks for codecs.readbuffer_encode().

    def test_array(self):
        # An array of signed bytes is returned as bytes plus its length.
        import array
        payload = array.array("b", b"spam")
        result = codecs.readbuffer_encode(payload)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        # An empty str gives empty bytes and a length of zero.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is accepted.
        for args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *args)
841
class UTF8SigTest(ReadTest, unittest.TestCase):
    # Runs the inherited ReadTest checks plus BOM-specific tests against the
    # "utf-8-sig" codec.
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Text encoded with utf-8-sig decodes back to itself through the
        # incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader once per read
        # size and check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            # The None entry means: call read() with no size argument.
            read_args = () if sizehint is None else (sizehint,)
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a UTF-8 BOM: the BOM is not part of the
        # expected text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a BOM must decode to the same result.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
925
class EscapeDecodeTest(unittest.TestCase):
    # Checks for codecs.escape_decode(); every result is a
    # (decoded bytes, input length consumed) pair.

    def test_empty(self):
        result = codecs.escape_decode("")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        backslash = b'\\'[0]
        for b in range(256):
            if b == backslash:
                continue
            data = bytes([b]) + b'0'
            self.assertEqual(codecs.escape_decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        # Rows are (escaped input, decoded output, length consumed).
        cases = (
            (b"[\\\n]", b"[]", 4),
            (br'[\"]', b'["]', 4),
            (br"[\']", b"[']", 4),
            (br"[\\]", br"[\]", 4),
            (br"[\a]", b"[\x07]", 4),
            (br"[\b]", b"[\x08]", 4),
            (br"[\t]", b"[\x09]", 4),
            (br"[\n]", b"[\x0a]", 4),
            (br"[\v]", b"[\x0b]", 4),
            (br"[\f]", b"[\x0c]", 4),
            (br"[\r]", b"[\x0d]", 4),
            (br"[\7]", b"[\x07]", 4),
            (br"[\8]", br"[\8]", 4),
            (br"[\78]", b"[\x078]", 5),
            (br"[\41]", b"[!]", 5),
            (br"[\418]", b"[!8]", 6),
            (br"[\101]", b"[A]", 6),
            (br"[\1010]", b"[A0]", 7),
            (br"[\501]", b"[A]", 6),
            (br"[\x41]", b"[A]", 6),
            (br"[\X41]", br"[\X41]", 6),
            (br"[\x410]", b"[A0]", 7),
        )
        for raw, wanted, consumed in cases:
            self.assertEqual(decode(raw), (wanted, consumed))
        # A backslash followed by any byte outside the recognised set is
        # left exactly as written.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for b in range(256):
            if b in recognised:
                continue
            data = b'\\' + bytes([b])
            self.assertEqual(decode(data), (data, 2))

    def test_errors(self):
        decode = codecs.escape_decode
        # Both a bare "\x" and a "\x" with a single hex digit are errors in
        # strict mode; "ignore" drops them and "replace" puts in "?".
        for stub, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + stub + b"]"
            self.assertRaises(ValueError, decode, stub)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(bracketed + stub, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(decode(bracketed + stub, "replace"),
                             (b"[?]?", consumed))
973
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000974class RecodingTest(unittest.TestCase):
975 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000976 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200977 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000978 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000979 f2.close()
980 # Python used to crash on this at exit because of a refcount
981 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000982
# From RFC 3492
# Each entry is a (text, punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]
1086
# Print any entry of the table that is not a two-item pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001090
class PunycodeTest(unittest.TestCase):
    # Checks the "punycode" codec against the punycode_testcases table.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii")
            reference = str(puny, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII round
            # trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001109
class UnicodeInternalTest(unittest.TestCase):
    # Checks for the deprecated "unicode_internal" codec.  The tests whose
    # byte strings are four bytes per code point are skipped unless
    # SIZEOF_WCHAR_T is 4.

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian and reversed on little-endian
        # hosts.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value above the Unicode range.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must describe the failing input:
        # codec name, original object and the offending byte range.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as cm:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # An error handler registered under a custom name is applied to the
        # four invalid bytes placed between the encoded "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001185
Martin v. Löwis2548c732003-04-18 10:39:54 +00001186# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1187nameprep_tests = [
1188 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001189 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1190 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1191 b'\xb8\x8f\xef\xbb\xbf',
1192 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001193 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001194 (b'CAFE',
1195 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001196 # 3.3 Case folding 8bit U+00DF (german sharp s).
1197 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001198 (b'\xc3\x9f',
1199 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001200 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001201 (b'\xc4\xb0',
1202 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001203 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001204 (b'\xc5\x83\xcd\xba',
1205 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001206 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1207 # XXX: skip this as it fails in UCS-2 mode
1208 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1209 # 'telc\xe2\x88\x95kg\xcf\x83'),
1210 (None, None),
1211 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001212 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1213 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001214 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001215 (b'\xe1\xbe\xb7',
1216 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001217 # 3.9 Self-reverting case folding U+01F0 and normalization.
1218 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001219 (b'\xc7\xb0',
1220 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001221 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001222 (b'\xce\x90',
1223 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001224 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001225 (b'\xce\xb0',
1226 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001227 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001228 (b'\xe1\xba\x96',
1229 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001230 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001231 (b'\xe1\xbd\x96',
1232 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001233 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001234 (b' ',
1235 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001236 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001237 (b'\xc2\xa0',
1238 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001239 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001240 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001241 None),
1242 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001243 (b'\xe2\x80\x80',
1244 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001245 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001246 (b'\xe2\x80\x8b',
1247 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001248 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001249 (b'\xe3\x80\x80',
1250 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001252 (b'\x10\x7f',
1253 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001255 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256 None),
1257 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001258 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001259 None),
1260 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001261 (b'\xef\xbb\xbf',
1262 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001263 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001264 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265 None),
1266 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001267 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268 None),
1269 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001270 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001271 None),
1272 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001273 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274 None),
1275 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001276 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001277 None),
1278 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001279 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280 None),
1281 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001282 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 None),
1284 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001285 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001286 None),
1287 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001288 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289 None),
1290 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001291 (b'\xcd\x81',
1292 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001294 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001295 None),
1296 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001297 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298 None),
1299 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001300 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301 None),
1302 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001303 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001304 None),
1305 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001306 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 None),
1308 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310 None),
1311 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'foo\xef\xb9\xb6bar',
1313 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316 None),
1317 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001318 (b'\xd8\xa71\xd8\xa8',
1319 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001321 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001322 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001323 # None),
1324 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 # 3.44 Larger test (shrinking).
1326 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001327 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1328 b'\xaa\xce\xb0\xe2\x80\x80',
1329 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001330 # 3.45 Larger test (expanding).
1331 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001332 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1333 b'\x80',
1334 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1335 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1336 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001337 ]
1338
1339
class NameprepTest(unittest.TestCase):
    """Run every entry of the module-level ``nameprep_tests`` table."""

    def test_nameprep(self):
        # Each table entry is (input, expected); both are UTF-8 encoded bytes.
        #   input is None    -> placeholder for a skipped vector
        #   expected is None -> nameprep() must reject the input
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8; "surrogatepass" lets
            # vectors that encode a lone surrogate be decoded at all.
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-label the failure with the vector number used in the
                    # table comments ("3.N"), keeping the original as cause.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos+1, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        # Plain ASCII and ACE ("xn--") names, each with and without a
        # trailing dot.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # ACE name without and with a trailing dot (the original asserted the
        # trailing-dot form twice and never covered the dot-less one).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # The assertions below pin the buffering contract: nothing is emitted
        # for a label until a dot terminates it or final=True is passed.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same sequence after reset(), this time ending with a dot so the
        # last label is flushed before the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without and with a trailing dot (the original
        # asserted the trailing-dot form twice).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # As for decoding: an unterminated label is held back until a dot
        # or final=True arrives.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001434
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the ``codecs`` module API."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default must handle plain ASCII.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # '\xffff' is U+00FF followed by "ff"; the non-ASCII first character
        # is enough to make the ascii codec fail.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        # register() needs exactly one callable argument.
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Registered before the locale is switched so it is restored on
        # every exit path, including the skip below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1489
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two multi-byte characters separated by a newline; the second line
        # has no terminating newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001499
class EncodedFileTest(unittest.TestCase):
    """Transcoding through ``codecs.EncodedFile`` in both directions."""

    def test_basic(self):
        # Reading: the underlying bytes are UTF-8, the caller receives the
        # same text re-encoded as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying stream ends up
        # holding the Latin-1 encoding of the same text.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001511
# Encodings exercised by the table-driven tests in BasicUnicodeTest.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings excluded from the incremental encoder/decoder checks.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001629
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Table-driven round-trip checks over ``all_unicode_encodings``."""

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the table entry once
            # "_" and "-" are treated as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one at a time.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # NOTE(review): the nesting of this section under the
                # broken_incremental_coders check was reconstructed --
                # confirm against the upstream file.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempted from the non-bytes argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1767
class CharmapTest(unittest.TestCase):
    """``codecs.charmap_decode`` with string, int->str and int->int maps.

    Every case decodes the same three bytes (0, 1, 2); only the mapping and
    the error handler vary.
    """

    def test_decode_with_string_map(self):
        data = b"\x00\x01\x02"

        self.assertEqual(
            codecs.charmap_decode(data, "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Map too short for byte 2.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict", "ab"
        )

        # U+FFFE in the map is treated like a missing entry.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # Empty map: every byte is unmapped and silently dropped.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        data = b"\x00\x01\x02"

        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A byte may map to more than one character ...
        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to the empty string.
        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(data, "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(data, "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        data = b"\x00\x01\x02"
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point beyond sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, data, "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, data, "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(data, "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
1961
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001962
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers must be usable as context managers."""

    def test_encodedfile(self):
        # UTF-8 on disk, Latin-1 handed to the caller.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001975
class TypesTest(unittest.TestCase):
    # Which input types (str vs. bytes) the low-level decoder functions take.

    def test_decode_unicode(self):
        # Most decoders don't accept str input and must raise TypeError.
        decoder_names = (
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        )
        decoders = [getattr(codecs, name) for name in decoder_names]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape beyond U+10FFFF is an error in strict mode and becomes
        # U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
2011
class SurrogateEscapeTest(unittest.TestCase):
    # Round-trips of undecodable bytes through the "surrogateescape" handler.

    def test_utf8(self):
        cases = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, escaped in cases:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), escaped)
            self.assertEqual(escaped.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, escaped = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, escaped = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2044
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002045
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write through codecs.open() with UTF-16/UTF-32 encodings, seek, and
        # verify that reading back from the start yields exactly the text
        # that was written.
        data = "1234567890"
        doubled = data * 2
        encodings = ["utf-%d%s" % (bits, order)
                     for bits in (16, 32)
                     for order in ("", "-le", "-be")]
        self.addCleanup(support.unlink, support.TESTFN)

        def whole_stream(stream):
            return stream

        def writer_only(stream):
            return stream.writer

        for encoding in encodings:
            def reopen():
                return codecs.open(support.TESTFN, 'w+', encoding=encoding)

            # Check if the BOM is written only once
            with reopen() as f:
                f.write(data)
                f.write(data)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0); first through
            # the stream itself, then through its StreamWriter.
            for pick in (whole_stream, writer_only):
                with reopen() as f:
                    target = pick(f)
                    target.write(data[0])
                    self.assertNotEqual(target.tell(), 0)
                    target.seek(0)
                    target.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start; again for the stream and then for
            # its StreamWriter.
            for pick in (whole_stream, writer_only):
                with reopen() as f:
                    target = pick(f)
                    target.write(data)
                    target.seek(target.tell())
                    target.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002101
Victor Stinner3fed0872010-05-22 02:16:27 +00002102
# Names of the bytes-to-bytes transform codecs exercised by
# TransformCodecTest.  The two compression codecs are listed only when the
# module they rely on can be imported.
bytes_transform_encodings = [
    prefix + "_codec" for prefix in ("base64", "uu", "quopri", "hex")
]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2121
class TransformCodecTest(unittest.TestCase):
    # Round-trip checks for every codec in bytes_transform_encodings.

    def test_basics(self):
        payload = bytes(range(256))
        for codec_name in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(codec_name)
            encoded, consumed = encode(payload)
            self.assertEqual(consumed, len(payload))
            decode = codecs.getdecoder(codec_name)
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for codec_name in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", codec_name)
            make_reader = codecs.getreader(codec_name)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # uu_codec and zlib_codec are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for codec_name in bytes_transform_encodings:
            if codec_name not in skipped:
                encoded = codecs.encode(b"\x80", codec_name)
                make_reader = codecs.getreader(codec_name)
                reader = make_reader(io.BytesIO(encoded))
                self.assertEqual(reader.readline(), b"\x80")
2149
2150
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError; code page 123 is
        # expected to fail with WindowsError.
        for exc_type, code_page in ((ValueError, -1), (WindowsError, 123)):
            with self.assertRaises(exc_type):
                codecs.code_page_encode(code_page, 'a')
            with self.assertRaises(exc_type):
                codecs.code_page_decode(code_page, b'a')

    def test_code_page_name(self):
        # The error text must mention the name of the code page.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Each entry of *tests* is (raw bytes, error handler, expected text);
        # an expected value of None means decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with errors=%r: %s'
                          % (raw, cp, errors, err))
            self.assertEqual(decoded[0], expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, decoded[0], expected))
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(decoded[1], 0)
            self.assertLessEqual(decoded[1], len(raw))

    def check_encode(self, cp, tests):
        # Each entry of *tests* is (text, error handler, expected bytes); an
        # expected value of None means encoding must raise
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with errors=%r: %s'
                          % (text, cp, errors, err))
            self.assertEqual(encoded[0], expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, encoded[0], expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The CP_UTF8 encode cases only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Decode with the final flag set to False and compare the full
        # (text, consumed byte count) result for each input.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2298
2299
# Run the test cases defined in this module through unittest's command-line
# runner when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()