blob: 235a91a6230f35607a2ef3a2324dffa9432c5289 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10003import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01004import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02005import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01006import sys
7import unittest
8import warnings
9
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only on Windows builds whose reported major version is 6 or higher.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional; without it the wchar_t size is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    The helper asserts, through ``self.assertEqual``, that ``coder(input)``
    equals the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
29
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append at the tail of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything; otherwise take at most
        # ``size`` items from the front and keep the remainder queued.
        pending = self._buffer
        if size < 0:
            taken, remainder = pending, pending[:0]  # [:0] => empty, same type
        else:
            taken, remainder = pending[:size], pending[size:]
        self._buffer = remainder
        return taken
49
class MixInCheckStateHandling:
    """Mixin with helpers that check getstate()/setstate() on incremental codecs.

    The host class must provide the unittest assertion methods.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point of the encoded input ``s``: decode the head,
        # capture the decoder state, restore it into a brand-new decoder and
        # decode the tail.  Both pieces together must equal ``u``.
        for split in range(len(s)+1):
            head, tail = s[:split], s[split:]
            decoder = codecs.getincrementaldecoder(encoding)()
            decoded_head = decoder.decode(head)
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            resumed = codecs.getincrementaldecoder(encoding)()
            resumed.setstate(state)
            decoded_tail = resumed.decode(tail, True)
            self.assertEqual(u, decoded_head+decoded_tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: encode the head of ``u``, move the state to
        # a fresh encoder, encode the tail, and compare the result with ``s``.
        for split in range(len(u)+1):
            head, tail = u[:split], u[split:]
            encoder = codecs.getincrementalencoder(encoding)()
            encoded_head = encoder.encode(head)
            state = encoder.getstate()
            resumed = codecs.getincrementalencoder(encoding)()
            resumed.setstate(state)
            encoded_tail = resumed.encode(tail, True)
            self.assertEqual(s, encoded_head+encoded_tail)
82
class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the per-codec test cases.

    Concrete subclasses set the ``encoding`` class attribute and also inherit
    from ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder; the
        # second pass checks whether the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for reset_first in (False, True):
            if reset_first:
                d.reset()
            result = ""
            for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line endings are spelled out as a tuple on purpose:
        # "\n \r\n \r \u2028".split() returns [] because every one of these
        # characters is whitespace, which would silently skip the loops below.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would look like end-of-input to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines it read with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the input lines unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written after a
        # readline() call becomes visible to the next call.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000302
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" written through one writer, for either byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = writer(sink)
        out.write("spam")
        out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        source = reader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of 0xff form a valid start of stream.
        for count in (4, 8):
            stream = io.BytesIO(count*b"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read
            "", # third byte of BOM read
            "", # fourth byte of BOM read => byteorder known
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        replaced = codecs.utf_32_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
393
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
437
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
481
482
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" written through one writer, for either byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = writer(sink)
        out.write("spam")
        out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        source = reader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream = io.BytesIO(raw)
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read => byteorder known
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        replaced = codecs.utf_16_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_16_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000562
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        text = "\U00010203"
        encoded = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
605
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        text = "\U00010203"
        encoded = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
648
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # (error handler, bytes produced for "[\uDC80]")
        by_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        # (text, its utf-8 form under the "surrogatepass" handler)
        pairs = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, encoded in pairs:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or malformed surrogate sequences are still rejected.
        for malformed in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                malformed.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000707
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the Windows "cp65001" codec.  The expected handling of lone
    # surrogates depends on VISTA_OR_LATER, so the case tables in
    # test_encode/test_decode are assembled at run time.
    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes); an expected
        # value of None means encoding must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            # Older Windows versions are expected to encode the lone
            # surrogate as a 3-byte sequence even in strict mode.
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # One sub-test per case so that a failing tuple does not hide
            # the results of the remaining ones.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                        '%a.encode("cp65001", %r)=%a != %a'
                        % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # Each case is (raw bytes, error handler, expected text); an
        # expected value of None means decoding must raise
        # UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            # Older Windows versions are expected to decode the encoded
            # lone surrogate even in strict mode.
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            # One sub-test per case so that a failing tuple does not hide
            # the results of the remaining ones.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                        '%a.decode("cp65001", %r)=%a != %a'
                        % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode must reject lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Every other handler substitutes its own replacement for U+DC80.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" must let lone surrogates round-trip unchanged.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
806
807
808
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        # Expected incremental results for "a+-b"; the list has one entry
        # more than the text has characters (presumably one per encoded
        # byte -- check_partial is defined on a base class not shown here).
        text = "a+-b"
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        malformed = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for data, replaced in malformed:
            with self.subTest(raw=data):
                # Strict decoding of the complete input must fail ...
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(data, 'strict', True)
                # ... while 'replace' substitutes U+FFFD for the bad part.
                self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # A non-BMP character and the equivalent surrogate pair must both
        # encode to the same UTF-7 sequence, which decodes back to the
        # single non-BMP character.
        codec = self.encoding
        encoded = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(codec), encoded)
        self.assertEqual('\ud801\udca0'.encode(codec), encoded)
        self.assertEqual(encoded.decode(codec), '\U000104A0')
853
class UTF16ExTest(unittest.TestCase):
    # Tests for the low-level codecs.utf_16_ex_decode() function.

    def test_errors(self):
        # A single byte cannot be decoded as UTF-16 in strict mode when the
        # last argument (final) is true.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
861
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        # An array.array of signed bytes is returned as the equivalent
        # bytes object together with its length.
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        # An empty str is accepted and yields empty bytes of length 0.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int argument is accepted.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
877
class UTF8SigTest(ReadTest, unittest.TestCase):
    # Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM).
    encoding = "utf-8-sig"

    def test_partial(self):
        # The text starts with U+FEFF, so its encoded form carries the
        # signature BOM followed by a second, encoded BOM that is real data.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM added by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader in chunks of
        # various sizes (None means a single unbounded read()) and check
        # that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One sub-test per size so a failure names the offending size
            # and the remaining sizes are still exercised.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty result signals end of stream.
                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
961
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode(), which interprets backslash escapes
    # in a bytes object.

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash is copied through unchanged.
        backslash = b'\\'
        for code in range(256):
            byte = bytes([code])
            if byte == backslash:
                continue
            raw = byte + b'0'
            self.assertEqual(codecs.escape_decode(raw), (raw, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in cases:
            check(escaped, plain)
        # A backslash followed by a byte that does not start a recognized
        # escape is left as is.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567x':
                continue
            byte = bytes([code])
            check(b'\\' + byte, b'\\' + byte)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes (no or only one hex digit), paired with the
        # total number of input bytes reported as consumed when the escape
        # appears both inside brackets and at the end of the input.
        for truncated, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + truncated + b"]"
            self.assertRaises(ValueError, decode, truncated)
            self.assertRaises(ValueError, decode, bracketed)
            doubled = bracketed + truncated
            self.assertEqual(decode(doubled, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(doubled, "replace"), (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001013
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Regression test: Python used to crash at exit after this sequence
        # because of a refcount bug in _codecsmodule.c.  There is nothing to
        # assert; the test passes if writing and closing do not blow up.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001022
# From RFC 3492
# Each entry is a 2-tuple: (text as str, its Punycode form as bytes).
# The letters in the comments are the labels of the individual samples.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1126
# Import-time sanity check of the table above: every entry must be a
# 2-tuple.  Malformed entries are only printed, not rejected.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001130
class PunycodeTest(unittest.TestCase):
    # Runs the "punycode" codec against the punycode_testcases table.

    def test_encode(self):
        for text, reference in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # reference encodings use upper case while our encoder emits
            # only lower case, and lower-casing just the reference is not
            # enough because some input characters are upper case too.
            produced = text.encode("punycode").decode("ascii").lower()
            wanted = reference.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an ASCII
            # round trip; the result must be the same.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001149
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the deprecated "unicode_internal" codec.  Most of them only
    # apply when wchar_t is 4 bytes wide (see the skipUnless decorators).

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; flip them on little-endian
        # machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value above the valid code point range.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The second 4-byte unit is invalid; the raised error must describe
        # exactly that slice of the input.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports a meaningful message if nothing is raised,
        # unlike a bare self.fail() in a try/except/else.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the "ignore" handler under a private name and check that
        # the decoder drops an invalid 4-byte unit placed between the
        # encoded forms of "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item returned by an encoder is the number of input
            # characters consumed, not the number of bytes produced.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001225
Martin v. Löwis2548c732003-04-18 10:39:54 +00001226# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1227nameprep_tests = [
1228 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001229 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1230 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1231 b'\xb8\x8f\xef\xbb\xbf',
1232 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001233 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001234 (b'CAFE',
1235 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001236 # 3.3 Case folding 8bit U+00DF (german sharp s).
1237 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001238 (b'\xc3\x9f',
1239 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001240 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001241 (b'\xc4\xb0',
1242 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001243 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001244 (b'\xc5\x83\xcd\xba',
1245 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001246 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1247 # XXX: skip this as it fails in UCS-2 mode
1248 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1249 # 'telc\xe2\x88\x95kg\xcf\x83'),
1250 (None, None),
1251 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001252 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1253 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001255 (b'\xe1\xbe\xb7',
1256 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257 # 3.9 Self-reverting case folding U+01F0 and normalization.
1258 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001259 (b'\xc7\xb0',
1260 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001261 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001262 (b'\xce\x90',
1263 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001264 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001265 (b'\xce\xb0',
1266 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001267 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001268 (b'\xe1\xba\x96',
1269 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001270 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001271 (b'\xe1\xbd\x96',
1272 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001274 (b' ',
1275 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001276 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001277 (b'\xc2\xa0',
1278 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001279 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001280 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001281 None),
1282 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001283 (b'\xe2\x80\x80',
1284 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001285 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001286 (b'\xe2\x80\x8b',
1287 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001288 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001289 (b'\xe3\x80\x80',
1290 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001291 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001292 (b'\x10\x7f',
1293 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001294 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001295 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001296 None),
1297 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001298 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001299 None),
1300 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001301 (b'\xef\xbb\xbf',
1302 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001304 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001305 None),
1306 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001307 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001308 None),
1309 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001310 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 None),
1312 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001313 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 None),
1315 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001316 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 None),
1318 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001319 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 None),
1321 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001322 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001323 None),
1324 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001325 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326 None),
1327 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001328 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001329 None),
1330 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001331 (b'\xcd\x81',
1332 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001333 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001334 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335 None),
1336 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001337 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001338 None),
1339 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001340 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341 None),
1342 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001343 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001344 None),
1345 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001346 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001347 None),
1348 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001349 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001350 None),
1351 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001352 (b'foo\xef\xb9\xb6bar',
1353 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001354 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001355 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001356 None),
1357 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001358 (b'\xd8\xa71\xd8\xa8',
1359 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001360 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001361 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001362 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001363 # None),
1364 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001365 # 3.44 Larger test (shrinking).
1366 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001367 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1368 b'\xaa\xce\xb0\xe2\x80\x80',
1369 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 # 3.45 Larger test (expanding).
1371 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001372 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1373 b'\x80',
1374 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1375 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1376 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001377 ]
1378
1379
class NameprepTest(unittest.TestCase):
    """Run the ``nameprep_tests`` vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        # Each vector is an (orig, prepped) pair of UTF-8 byte strings.
        # prepped is None when nameprep() must reject the input, and
        # orig is None for vectors that are deliberately skipped.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered from 1; the label is attached to every
            # failure so the offending vector can be found in the table.
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    actual = nameprep(orig)
                except UnicodeError as exc:
                    # Report a rejected-but-valid input as a regular test
                    # failure that names the vector.
                    self.fail("%s: unexpected %r" % (label, exc))
                self.assertEqual(actual, prepped, label)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001398
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        # ASCII and punycode ("xn--") names, with and without a trailing dot.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Mirror image of test_builtin_decode().
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the whole input has been consumed, read() returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Punycode name without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is held back until its terminating dot (or the final
        # call) is seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is held back until its terminating dot (or the final
        # call) is seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001474
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of the codecs module functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding given: the default one is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding given: the default one is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bad_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bad_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Put the previous LC_CTYPE back whatever happens below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        else:
            info = codecs.lookup('ASCII')
            self.assertEqual(info.name, 'ascii')
1529
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two multi-byte characters separated by a newline, UTF-8 encoded.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001539
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller gets UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        reading = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(reading.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        sink = io.BytesIO()
        writing = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        writing.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001551
# Names of the text encodings exercised by BasicUnicodeTest, grouped by
# family (the overall order is unchanged).
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    # IBM / Windows code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    # EUC and GB families
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    # ISO 2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO 8859
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    # Macintosh
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    # UTF
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings excluded from the incremental coder checks: the ones above
# plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001669
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Round-trip and argument checks run over every name in
    # all_unicode_encodings.  The check_state_handling_*() helpers used by
    # test_decoder_state() are expected to come from MixInCheckStateHandling.

    def test_basics(self):
        # For each encoding: check the registered codec name, then
        # round-trip a small ASCII string through the stateless functions,
        # the stream reader/writer and the incremental coders.
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the requested name,
            # treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and collect whatever the
                # writer pushed into the queue after each write.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # Final call flushes anything the encoder still buffers.
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Note: no final flush call in this variant.
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Rewind and re-read several times; every pass must return the
            # complete text again.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder function must reject a call without arguments.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the non-bytes argument
            # check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder function must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # The comparison itself is trivially true; it only exercises the type.
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1807
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with str, int->str and int->int mappings."""

    def test_decode_with_string_map(self):
        # The mapping is a str indexed by byte value.
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))
        self.assertEqual(
            decode(data, "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 has no usable mapping here: the map is either too short
        # or maps it to U+FFFE.
        undefined_maps = ("ab", "ab\ufffe")
        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "replace", charmap),
                             ("ab\ufffd", 3))
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "ignore", charmap), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # The mapping is a dict from byte value to replacement string.
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        well_defined = [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]
        for charmap, expected in well_defined:
            self.assertEqual(decode(data, "strict", charmap), (expected, 3))

        # Byte 2 has no usable mapping: missing key, None, or U+FFFE
        # (the last one is Issue #14850).
        undefined_maps = [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ]
        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "replace", charmap),
                             ("ab\ufffd", 3))
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "ignore", charmap), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(
            decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        # The mapping is a dict from byte value to code point.
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(data, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is rejected with TypeError.
        self.assertRaises(TypeError,
                          decode, data, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 has no usable mapping: missing key or U+FFFE.
        undefined_maps = [
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ]
        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "replace", charmap),
                             ("ab\ufffd", 3))
        for charmap in undefined_maps:
            self.assertEqual(decode(data, "ignore", charmap), ("ab", 3))
2001
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002002
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers can be used as context managers."""

    def test_encodedfile(self):
        """EncodedFile used in a with statement recodes UTF-8 to Latin-1."""
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter used in a with statement decodes UTF-8 bytes."""
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        stream = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                           utf8.streamwriter, 'strict')
        with stream as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002015
class TypesTest(unittest.TestCase):
    """Which low-level decoders accept str input and which reject it."""

    def test_decode_unicode(self):
        # Most decoders don't accept str input.
        names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        decoders = [getattr(codecs, name) for name in names]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape above U+10FFFF is an error unless "replace" is used.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
2051
Serhiy Storchakad6793772013-01-29 10:20:44 +02002052
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode_escape encoder and decoder functions."""

    def test_empty(self):
        """Empty input encodes and decodes to empty output."""
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Code points 32..126 other than the backslash encode to themselves."""
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        """Every byte other than the backslash decodes to the same code point."""
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(
                codecs.unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        """Characters that need escaping are encoded as the expected escapes."""
        check = coding_checker(self, codecs.unicode_escape_encode)
        for char, escaped in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(char, escaped)
        # Remaining control characters and 127..255 use the \xNN form.
        for code in range(32):
            if chr(code) not in '\t\n\r':
                check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Each recognised escape decodes; unknown escapes are kept as is."""
        check = coding_checker(self, codecs.unicode_escape_decode)
        for escaped, text in (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        ):
            check(escaped, text)
        # A backslash before any other byte is left in the output unchanged.
        for code in range(256):
            if code not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        """Truncated or out-of-range escapes obey the error handler."""
        decode = codecs.unicode_escape_decode
        for letter, count in (b'x', 2), (b'u', 4), (b'U', 4):
            for ndigits in range(count):
                truncated = b"\\" + letter + b"0"*ndigits
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2129
2130
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the raw_unicode_escape encoder and decoder functions."""

    def test_empty(self):
        """Empty input encodes and decodes to empty output."""
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Every code point below 256 encodes to the byte of the same value."""
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        """Every byte decodes to the code point of the same value."""
        for code in range(256):
            self.assertEqual(
                codecs.raw_unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        """Backslash pairs pass through; non-Latin-1 text uses \\u and \\U."""
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Only \\u and \\U escapes are interpreted when decoding."""
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        """Truncated or out-of-range escapes obey the error handler."""
        decode = codecs.raw_unicode_escape_decode
        for letter, count in (b'u', 4), (b'U', 4):
            for ndigits in range(count):
                truncated = b"\\" + letter + b"0"*ndigits
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2179
2180
class SurrogateEscapeTest(unittest.TestCase):
    """Behaviour of the "surrogateescape" error handler for several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decode first, then encode, mirroring the order of the checks.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2213
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002214
class BomTest(unittest.TestCase):
    """Byte order mark handling of codecs.open() files around seek()."""

    def test_seek0(self):
        data = "1234567890"
        twice = data * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        for encoding in encodings:
            # Two consecutive writes: the BOM must be written only once,
            # and reading back after seek(0) must be repeatable.
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)

            # Rewriting from the start: the BOM is written again after
            # a seek(0).
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Same check, driving the underlying StreamWriter directly.
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Seeking to a position other than the start must not write
            # another BOM.
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)

            # Same check, driving the underlying StreamWriter directly.
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002270
Victor Stinner3fed0872010-05-22 02:16:27 +00002271
# Names of the bytes-to-bytes codecs exercised by TransformCodecTest.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# The compression codecs are only listed when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2290
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes codecs in bytes_transform_encodings."""

    def test_basics(self):
        """Encoding then decoding all 256 byte values round-trips."""
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        """A stream reader's read() decodes the whole encoded payload."""
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        """A stream reader's readline() decodes the encoded payload."""
        for encoding in bytes_transform_encodings:
            # uu_codec and zlib_codec are excluded from this check.
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_type_error_for_text_input(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                msg = "^encoding with '{}' codec failed".format(encoding)
                with self.assertRaisesRegex(TypeError, msg) as failure:
                    bad_input.encode(encoding)
                # The codec's own error must be chained as the cause.
                self.assertIsInstance(failure.exception.__cause__,
                                      TypeError)

    def test_type_error_for_binary_input(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = "^decoding with 'rot_13' codec failed"
                with self.assertRaisesRegex(AttributeError, msg) as failure:
                    bad_input.decode("rot_13")
                # The codec's own error must be chained as the cause.
                self.assertIsInstance(failure.exception.__cause__,
                                      AttributeError)

    def test_bad_decoding_output_type(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                # Raw string: "\(" is a regex escape, not a str escape.
                fmt = (r"'{}' decoder returned 'bytes' instead of 'str'; "
                       r"use codecs.decode\(\) to decode to arbitrary types")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(TypeError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(TypeError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_bad_encoding_output_type(self):
        # Check str.encode gives a good error message for str -> str codecs
        # Raw string: "\(" is a regex escape, not a str escape.
        msg = (r"'rot_13' encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types")
        with self.assertRaisesRegex(TypeError, msg):
            "just an example message".encode("rot_13")
2382
2383
2384# The codec system tries to wrap exceptions in order to ensure the error
2385# mentions the operation being performed and the codec involved. We
2386# currently *only* want this to happen for relatively stateless
2387# exceptions, where the only significant information they contain is their
2388# type and a single str argument.
2389class ExceptionChainingTest(unittest.TestCase):
2390
2391 def setUp(self):
2392 # There's no way to unregister a codec search function, so we just
2393 # ensure we render this one fairly harmless after the test
2394 # case finishes by using the test case repr as the codec name
2395 # The codecs module normalizes codec names, although this doesn't
2396 # appear to be formally documented...
2397 self.codec_name = repr(self).lower().replace(" ", "-")
2398 self.codec_info = None
2399 codecs.register(self.get_codec)
2400
2401 def get_codec(self, codec_name):
2402 if codec_name != self.codec_name:
2403 return None
2404 return self.codec_info
2405
2406 def set_codec(self, obj_to_raise):
2407 def raise_obj(*args, **kwds):
2408 raise obj_to_raise
2409 self.codec_info = codecs.CodecInfo(raise_obj, raise_obj,
2410 name=self.codec_name)
2411
2412 @contextlib.contextmanager
2413 def assertWrapped(self, operation, exc_type, msg):
2414 full_msg = "{} with '{}' codec failed \({}: {}\)".format(
2415 operation, self.codec_name, exc_type.__name__, msg)
2416 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2417 yield caught
2418
2419 def check_wrapped(self, obj_to_raise, msg):
2420 self.set_codec(obj_to_raise)
2421 with self.assertWrapped("encoding", RuntimeError, msg):
2422 "str_input".encode(self.codec_name)
2423 with self.assertWrapped("encoding", RuntimeError, msg):
2424 codecs.encode("str_input", self.codec_name)
2425 with self.assertWrapped("decoding", RuntimeError, msg):
2426 b"bytes input".decode(self.codec_name)
2427 with self.assertWrapped("decoding", RuntimeError, msg):
2428 codecs.decode(b"bytes input", self.codec_name)
2429
2430 def test_raise_by_type(self):
2431 self.check_wrapped(RuntimeError, "")
2432
2433 def test_raise_by_value(self):
2434 msg = "This should be wrapped"
2435 self.check_wrapped(RuntimeError(msg), msg)
2436
2437 @contextlib.contextmanager
Nick Coghlanc4c25802013-11-15 21:47:37 +10002438 def assertNotWrapped(self, operation, exc_type, msg_re, msg=None):
2439 if msg is None:
2440 msg = msg_re
Nick Coghlan8b097b42013-11-13 23:49:21 +10002441 with self.assertRaisesRegex(exc_type, msg) as caught:
2442 yield caught
Nick Coghlanc4c25802013-11-15 21:47:37 +10002443 self.assertEqual(str(caught.exception), msg)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002444
Nick Coghlanc4c25802013-11-15 21:47:37 +10002445 def check_not_wrapped(self, obj_to_raise, msg_re, msg=None):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002446 self.set_codec(obj_to_raise)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002447 with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002448 "str input".encode(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002449 with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002450 codecs.encode("str input", self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002451 with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002452 b"bytes input".decode(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002453 with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002454 codecs.decode(b"bytes input", self.codec_name)
2455
2456 def test_init_override_is_not_wrapped(self):
2457 class CustomInit(RuntimeError):
2458 def __init__(self):
2459 pass
2460 self.check_not_wrapped(CustomInit, "")
2461
2462 def test_new_override_is_not_wrapped(self):
2463 class CustomNew(RuntimeError):
2464 def __new__(cls):
2465 return super().__new__(cls)
2466 self.check_not_wrapped(CustomNew, "")
2467
2468 def test_instance_attribute_is_not_wrapped(self):
2469 msg = "This should NOT be wrapped"
2470 exc = RuntimeError(msg)
2471 exc.attr = 1
2472 self.check_not_wrapped(exc, msg)
2473
2474 def test_non_str_arg_is_not_wrapped(self):
2475 self.check_not_wrapped(RuntimeError(1), "1")
2476
2477 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002478 msg_re = "\('a', 'b', 'c'\)"
2479 msg = "('a', 'b', 'c')"
2480 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re, msg)
2481
2482 # http://bugs.python.org/issue19609
2483 def test_codec_lookup_failure_not_wrapped(self):
2484 msg = "unknown encoding: %s" % self.codec_name
2485 # The initial codec lookup should not be wrapped
2486 with self.assertNotWrapped("encoding", LookupError, msg):
2487 "str input".encode(self.codec_name)
2488 with self.assertNotWrapped("encoding", LookupError, msg):
2489 codecs.encode("str input", self.codec_name)
2490 with self.assertNotWrapped("decoding", LookupError, msg):
2491 b"bytes input".decode(self.codec_name)
2492 with self.assertNotWrapped("decoding", LookupError, msg):
2493 codecs.decode(b"bytes input", self.codec_name)
2494
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002495
Georg Brandl02524622010-12-02 18:06:51 +00002496
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        """Bad code page numbers raise ValueError (-1) or OSError (123)."""
        for error, page in ((ValueError, -1), (OSError, 123)):
            self.assertRaises(error, codecs.code_page_encode, page, 'a')
            self.assertRaises(error, codecs.code_page_decode, page, b'a')

    def test_code_page_name(self):
        """The error message names the code page that failed."""
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) rows; expected None means must fail."""
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) rows; expected None means must fail."""
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw = encoded[0]
            consumed = encoded[1]
            self.assertEqual(raw, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, raw, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_rows)
        self.check_decode(932, decode_rows)

    def test_cp1252(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_rows)
        self.check_decode(1252, decode_rows)

    def test_cp_utf7(self):
        cp = 65000
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_rows)
        self.check_decode(cp, decode_rows)

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The UTF-8 encode rows are only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        """Decode with the fourth argument False; see the consumed counts."""
        for raw, expected in (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        ):
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2644
2645
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()