blob: 2f3cf4d9f5d802269ad4e567006f1ccd44c9d5f2 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only on Windows whose reported major version is 6 or higher;
# always False on other platforms (sys.getwindowsversion() is only called
# when sys.platform is 'win32').
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional: when it cannot be imported, the name is bound to None
# and SIZEOF_WCHAR_T is set to the sentinel -1.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
def coding_checker(self, coder):
    """Return a checking function bound to the test case *self*.

    The returned ``check(input, expect)`` calls ``coder(input)`` and asserts,
    via ``self.assertEqual``, that the result is ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
28
class Queue:
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type is kept for all later
        # results because only "+=" and slicing are applied to it.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            size = len(self._buffer)
        chunk = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return chunk
48
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for incremental codecs.

    The host class must provide the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted and resumed anywhere.

        For every split point the first part is decoded, the decoder state is
        transferred to a brand new decoder, and the remainder is decoded with
        it; the concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted and resumed anywhere.

        For every split point the first part is encoded, the encoder state is
        transferred to a brand new encoder, and the remainder is encoded with
        it; the concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            # Continue with a fresh encoder that only knows the saved state
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
81
class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the per-codec test classes.

    Every method reads the codec name from ``self.encoding``, so concrete
    subclasses must define that attribute and supply the unittest assertion
    methods.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so the line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: every character of
        # "\n \r\n \r \u2028" is whitespace, so calling .split() on it
        # returns an empty list and the loops below would never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty: readalllines() stops at
            # the first empty result, which is what a bare line end yields
            # with keepends=False.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() separates the lines with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx\n" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over the reader must give back exactly these lines.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written through the
        # writer becomes available to the reader afterwards.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000301
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec, including its BOM handling."""
    encoding = "utf-32"

    # The two byte strings test_only_one_bom() accepts for "spam" written
    # twice: a single leading BOM followed by eight 4-byte code units,
    # little-endian and big-endian respectively.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of 0xff may be accepted by the reader.
        for count in (4, 8):
            s = io.BytesIO(count*b"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""] * 4    # the four BOM bytes; byteorder known after the fourth
            + [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
392
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
436
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
480
481
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec, including its BOM handling."""
    encoding = "utf-16"

    # The two byte strings test_only_one_bom() accepts for "spam" written
    # twice: a single leading BOM followed by eight 2-byte code units,
    # little-endian and big-endian respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two 0xffff code units may be accepted by the reader.
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""] * 2    # the two BOM bytes; byteorder known after the second
            + [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' instead of the former 'U': the universal-newlines flag
        # is deprecated and is rejected by open() in newer Python versions,
        # and this test expects the '\r\n' line ends to come back unchanged.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000561
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
604
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            # one expected result per encoded byte
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
647
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec, including lone-surrogate handling."""
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            # one expected result per encoded byte
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Without an error handler a lone surrogate is rejected both ways.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # Expected encoder output for each error handler.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or broken surrogate sequence must still be an error.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000706
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the "cp65001" codec.  Several expectations differ depending
    # on the module-level VISTA_OR_LATER flag.
    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Each entry is (bytes, error handler, expected text); an expected
        # value of None means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions; the other
        # encode error handlers each substitute their own replacement.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" round-trips a lone surrogate through its
        # three-byte form, alone or next to a non-BMP character.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(
            b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
            "abc\ud800def")
        self.assertEqual(
            "\U00010fff\uD800".encode("cp65001", "surrogatepass"),
            b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001",
                                                   "surrogatepass"),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
805
806
807
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_partial(self):
        # One expected string per encoded byte of "a+-b"; the first two
        # entries are both "a".
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000822
class UTF16ExTest(unittest.TestCase):
    # Argument and error checks for codecs.utf_16_ex_decode().

    def test_errors(self):
        # A single odd byte with final=True cannot be decoded.
        self.assertRaises(UnicodeDecodeError,
                          codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
830
class ReadBufferTest(unittest.TestCase):
    # Checks for codecs.readbuffer_encode().

    def test_array(self):
        # An array.array is accepted and its bytes come back unchanged,
        # together with the number of bytes consumed.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
846
class UTF8SigTest(ReadTest, unittest.TestCase):
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected string per encoded byte.  The sample text starts
        # with U+FEFF, so the encoded data holds the signature followed by
        # a second, "real" BOM character.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding and then incrementally decoding must return the text
        # unchanged.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader once per read
        # size (None means one unbounded read()) and check that the pieces
        # always join up to unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a signature must decode identically.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
930
class EscapeDecodeTest(unittest.TestCase):
    # Checks for codecs.escape_decode(), which returns a
    # (decoded bytes, consumed length) pair.

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by b'0', passes through
        # unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            char = bytes([value])
            if char != b'\\':
                self.assertEqual(decode(char + b'0'), (char + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # A backslash followed by any byte outside the listed escape
        # characters is expected to come back unchanged.
        for value in range(256):
            if value not in b'\n"\'\\abtnvfr01234567x':
                char = bytes([value])
                check(b'\\' + char, b'\\' + char)

    def test_errors(self):
        # Truncated \x escapes raise ValueError by default; "ignore" drops
        # them and "replace" substitutes b"?".
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200982
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        f = io.BytesIO()
        # The with block closes the recoder even if write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
Fred Drake2e2be372001-09-20 21:33:42 +0000991
# From RFC 3492
# Each entry is a (text, punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check on the table above: print any entry that is not a pair
# (for example because of a missing comma between adjacent literals).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001099
class PunycodeTest(unittest.TestCase):
    # Runs the "punycode" codec over every pair in punycode_testcases.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a copy that went through an ASCII
            # round-trip.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001118
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec.  The decoding tests only run
    # when wchar_t is 4 bytes wide (see SIZEOF_WCHAR_T).

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on
        # little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The error must name the codec, carry the original input and
        # report the slice [4:8].
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a private name and check
        # that the four inserted \x22 bytes are dropped, leaving "ab".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item of the result is a count of input
            # characters.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001194
Martin v. Löwis2548c732003-04-18 10:39:54 +00001195# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1196nameprep_tests = [
1197 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001198 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1199 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1200 b'\xb8\x8f\xef\xbb\xbf',
1201 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001202 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001203 (b'CAFE',
1204 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001205 # 3.3 Case folding 8bit U+00DF (german sharp s).
1206 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001207 (b'\xc3\x9f',
1208 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001209 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001210 (b'\xc4\xb0',
1211 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001212 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001213 (b'\xc5\x83\xcd\xba',
1214 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001215 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1216 # XXX: skip this as it fails in UCS-2 mode
1217 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1218 # 'telc\xe2\x88\x95kg\xcf\x83'),
1219 (None, None),
1220 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001221 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1222 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001223 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001224 (b'\xe1\xbe\xb7',
1225 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001226 # 3.9 Self-reverting case folding U+01F0 and normalization.
1227 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001228 (b'\xc7\xb0',
1229 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001230 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001231 (b'\xce\x90',
1232 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001233 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001234 (b'\xce\xb0',
1235 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001236 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001237 (b'\xe1\xba\x96',
1238 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001239 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001240 (b'\xe1\xbd\x96',
1241 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001242 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001243 (b' ',
1244 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001245 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001246 (b'\xc2\xa0',
1247 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001248 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001249 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001250 None),
1251 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001252 (b'\xe2\x80\x80',
1253 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001254 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001255 (b'\xe2\x80\x8b',
1256 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001257 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001258 (b'\xe3\x80\x80',
1259 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001260 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001261 (b'\x10\x7f',
1262 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001263 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001264 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265 None),
1266 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001267 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268 None),
1269 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001270 (b'\xef\xbb\xbf',
1271 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001272 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001273 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274 None),
1275 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001276 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001277 None),
1278 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001279 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280 None),
1281 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001282 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 None),
1284 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001285 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001286 None),
1287 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001288 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289 None),
1290 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001291 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292 None),
1293 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001294 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001295 None),
1296 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001297 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298 None),
1299 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001300 (b'\xcd\x81',
1301 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001303 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001304 None),
1305 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001306 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 None),
1308 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310 None),
1311 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313 None),
1314 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316 None),
1317 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001318 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001319 None),
1320 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001321 (b'foo\xef\xb9\xb6bar',
1322 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001323 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001324 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 None),
1326 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001327 (b'\xd8\xa71\xd8\xa8',
1328 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001329 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001330 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001331 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001332 # None),
1333 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334 # 3.44 Larger test (shrinking).
1335 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001336 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1337 b'\xaa\xce\xb0\xe2\x80\x80',
1338 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339 # 3.45 Larger test (expanding).
1340 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001341 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1342 b'\x80',
1343 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1344 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1345 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346 ]
1347
1348
class NameprepTest(unittest.TestCase):
    """Run the numbered nameprep vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep

        for pos, (orig, prepped) in enumerate(nameprep_tests):
            # A (None, None) entry marks a vector that is deliberately skipped.
            if orig is None:
                continue

            # Vectors are stored as UTF-8 bytes; "surrogatepass" keeps
            # encoded surrogates decodable instead of raising here.
            text = orig.decode("utf-8", "surrogatepass")

            if prepped is None:
                # No expected output: the input must be rejected.
                self.assertRaises(UnicodeError, nameprep, text)
                continue

            wanted = prepped.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the 1-based vector number in the message.
                raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001367
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through the one-shot, stream and
    incremental interfaces, with and without a trailing dot."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed iterdecode() one byte at a time.  The same four inputs as
        # test_builtin_decode are used, so the IDN case is checked both with
        # and without a trailing dot (it used to be checked twice with one).
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        # A label is only emitted once the dot that ends it has been seen;
        # an unterminated last label is held back until the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # After reset() the decoder starts afresh; with a trailing dot the
        # last label is complete before the final call, which adds nothing.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode, pushed through
        # iterencode() one character at a time.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Mirror of the decoder checks: output appears label by label and
        # the unterminated last label is flushed by the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001443
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the module-level API."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument may be omitted.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument may be omitted.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Put the original LC_CTYPE back whatever happens below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1498
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two UTF-8 encoded lines; the second has no trailing newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        expected_lines = ['\ud55c\n', '\uae00']
        wrapped = self.reader(self.stream)
        self.assertEqual(wrapped.readlines(), expected_lines)
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001508
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile must recode data in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, read() hands back
        # the same text as UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoding_reader.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes go in, Latin-1 bytes land in the file.
        sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001520
# Codec names that the generic tests below iterate over, in the order
# they are exercised.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Everything that is broken with streams, plus "idna", is also excluded
# from the incremental coder checks.
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001638
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every name in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII string through each codec's stateless,
        stream, incremental (Python and C API) and iterator interfaces."""
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the requested name
            # once "_" and "-" are treated as the same character.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and drain the queue after
                # each write; every drained chunk must be a bytes object.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Read back one byte at a time through a fresh queue.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still buffers
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # NOTE(review): nesting of this "ignore" check under the
                # broken_incremental_coders test is reconstructed from a
                # listing that lost its indentation -- confirm against the
                # canonical file.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """Re-reading after seek(0, 0) must return the full text each time."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """Every decoder rejects a missing argument; all but idna and
        punycode are also checked to reject an int."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Every encoder rejects being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # Comparing the type with itself is deliberate: only the act of
        # touching the type matters here.
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # Both helpers are expected to come from MixInCheckStateHandling.
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1776
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() driven by str, int->str and int->int maps."""

    def _check_cases(self, cases):
        # Decode the same three bytes for every (errors, mapping, outcome)
        # row, in order.  A str outcome is the expected text, with all three
        # input bytes reported as consumed; any other outcome is the
        # exception class the call has to raise.
        data = b"\x00\x01\x02"
        for errors, mapping, outcome in cases:
            if isinstance(outcome, str):
                self.assertEqual(
                    codecs.charmap_decode(data, errors, mapping),
                    (outcome, 3)
                )
            else:
                self.assertRaises(outcome,
                    codecs.charmap_decode, data, errors, mapping
                )

    def _check_all_bytes_ignored(self, empty_map):
        # With an empty mapping and "ignore", every byte value is dropped
        # but still counted as consumed.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", empty_map),
            ("", len(allbytes))
        )

    def test_decode_with_string_map(self):
        self._check_cases([
            ("strict", "abc", "abc"),
            ("strict", "\U0010FFFFbc", "\U0010FFFFbc"),
            # Map too short, or byte mapped to U+FFFE.
            ("strict", "ab", UnicodeDecodeError),
            ("strict", "ab\ufffe", UnicodeDecodeError),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        ])
        self._check_all_bytes_ignored("")

    def test_decode_with_int2str_map(self):
        self._check_cases([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ("strict", {0: 'a', 1: 'b', 2: ''}, "ab"),
            ("strict", {0: 'a', 1: 'b'}, UnicodeDecodeError),
            ("strict", {0: 'a', 1: 'b', 2: None}, UnicodeDecodeError),
            # Issue #14850
            ("strict", {0: 'a', 1: 'b', 2: '\ufffe'}, UnicodeDecodeError),
            ("replace", {0: 'a', 1: 'b'}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: None}, "ab\ufffd"),
            # Issue #14850
            ("replace", {0: 'a', 1: 'b', 2: '\ufffe'}, "ab\ufffd"),
            ("ignore", {0: 'a', 1: 'b'}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: None}, "ab"),
            # Issue #14850
            ("ignore", {0: 'a', 1: 'b', 2: '\ufffe'}, "ab"),
        ])
        self._check_all_bytes_ignored({})

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')
        self._check_cases([
            ("strict", {0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             chr(sys.maxunicode) + "bc"),
            # One past the largest code point is a TypeError, not a
            # decoding error.
            ("strict", {0: sys.maxunicode + 1, 1: b, 2: c}, TypeError),
            ("strict", {0: a, 1: b}, UnicodeDecodeError),
            ("strict", {0: a, 1: b, 2: 0xFFFE}, UnicodeDecodeError),
            ("replace", {0: a, 1: b}, "ab\ufffd"),
            ("replace", {0: a, 1: b, 2: 0xFFFE}, "ab\ufffd"),
            ("ignore", {0: a, 1: b}, "ab"),
            ("ignore", {0: a, 1: b, 2: 0xFFFE}, "ab"),
        ])
1970
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001971
class WithStmtTest(unittest.TestCase):
    """EncodedFile and StreamReaderWriter objects work in a with statement."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            recoded = ef.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001984
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept ``str`` input."""

    def test_decode_unicode(self):
        """Byte-oriented decoders reject a ``str`` argument with TypeError."""
        # Most decoders don't accept unicode input
        bytes_only = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is not present on every build, so add it conditionally.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only.append(codecs.mbcs_decode)
        for decode in bytes_only:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        """The escape decoders accept ``str`` and ``bytes`` alike."""
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )

        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # A code point above U+10FFFF is an error in strict mode and becomes
        # U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2020
Serhiy Storchakad6793772013-01-29 10:20:44 +02002021
class UnicodeEscapeTest(unittest.TestCase):
    """Exercise the ``unicode_escape`` encoder and decoder in :mod:`codecs`."""

    def test_empty(self):
        """Empty input maps to empty output in both directions."""
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Printable ASCII other than the backslash is encoded as itself."""
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """A byte other than the backslash decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Control, non-ASCII and backslash characters are written escaped."""
        encode = codecs.unicode_escape_encode

        def check(text, expect):
            # The second item of the result is the number of items consumed.
            self.assertEqual(encode(text), (expect, len(text)))

        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining C0 controls have no short form and use \xNN.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        # DEL and the Latin-1 upper half use \xNN as well.
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Every recognised escape decodes; unknown ones are kept verbatim."""
        decode = codecs.unicode_escape_decode

        def check(data, expect):
            # The second item of the result is the number of items consumed.
            self.assertEqual(decode(data), (expect, len(data)))

        # Backslash followed by a newline is dropped entirely.
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        # Octal escapes take one to three octal digits; 8 is not one.
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        # \x takes exactly two hex digits.
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Any other byte after a backslash leaves both characters in place.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        """Truncated hex escapes and out-of-range code points are errors."""
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires).
        # Every shorter digit count must be reported as a truncated escape.
        for c, digits in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(digits):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape in the middle and one at the end.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # 0x110000 is one past the largest valid code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2098
2099
class RawUnicodeEscapeTest(unittest.TestCase):
    """Exercise the ``raw_unicode_escape`` encoder and decoder."""

    def test_empty(self):
        """Empty input maps to empty output in both directions."""
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Every code point below 256 is encoded as the byte of that value."""
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """Every byte decodes to the code point of the same value."""
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Only code points above 255 are written as \\u / \\U escapes."""
        encode = codecs.raw_unicode_escape_encode

        def check(text, expect):
            # The second item of the result is the number of items consumed.
            self.assertEqual(encode(text), (expect, len(text)))

        # A backslash before anything but u/U is passed through unchanged.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Only \\u and \\U are interpreted; other backslashes are literal."""
        decode = codecs.raw_unicode_escape_decode

        def check(data, expect):
            # The second item of the result is the number of items consumed.
            self.assertEqual(decode(data), (expect, len(data)))

        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        """Truncated hex escapes and out-of-range code points are errors."""
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, number of hex digits it requires).
        # Every shorter digit count must be reported as a truncated escape.
        for c, digits in (b'u', 4), (b'U', 8):
            for i in range(digits):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape in the middle and one at the end.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # 0x110000 is one past the largest valid code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2148
2149
class SurrogateEscapeTest(unittest.TestCase):
    """Undecodable bytes survive a round trip via ``surrogateescape``."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding maps each bad byte 0xNN to the lone surrogate U+DCNN;
        # encoding the result must give back the original bytes.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        """UTF-8: a stray byte and a UTF-8-encoded surrogate are escaped."""
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        """ASCII: a byte above 0x7F is escaped."""
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        """Charmap codecs: an unmapped byte is escaped."""
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        """Latin-1 encoding turns escaped surrogates back into bytes."""
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2182
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002183
class BomTest(unittest.TestCase):
    """BOM handling of the UTF-16/UTF-32 stream codecs around seeks."""

    def test_seek0(self):
        """Written text reads back unchanged however writes and seeks mix."""
        text = "1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(name):
            # Each scenario reopens the shared test file in 'w+' mode.
            return codecs.open(support.TESTFN, 'w+', encoding=name)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002239
Victor Stinner3fed0872010-05-22 02:16:27 +00002240
# Bytes-to-bytes transform codecs exercised by the transform tests.  The
# compression codecs are listed only when their backing module imports.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2259
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the bytes-to-bytes transform codecs."""

    def test_basics(self):
        """Encoding then decoding all 256 byte values restores the input."""
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoder = codecs.getencoder(encoding)
            encoded, consumed = encoder(payload)
            self.assertEqual(consumed, len(payload))

            decoder = codecs.getdecoder(encoding)
            decoded, consumed = decoder(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        """A stream reader's read() returns the decoded bytes."""
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        """A stream reader's readline() returns the decoded bytes."""
        # uu_codec and zlib_codec are excluded from the readline check.
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                encoded = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(encoded)
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.readline(), b"\x80")
2287
2288
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for ``codecs.code_page_encode`` and ``code_page_decode``."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        """A negative code page is a ValueError, an unknown one an OSError."""
        for exc_type, cp in ((ValueError, -1), (OSError, 123)):
            self.assertRaises(exc_type, codecs.code_page_encode, cp, 'a')
            self.assertRaises(exc_type, codecs.code_page_decode, cp, b'a')

    def test_code_page_name(self):
        """The error message names the code page that failed."""
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each case with code page *cp* and compare the result.

        *tests* is an iterable of ``(raw, errors, expected)`` triples.  When
        *expected* is None the call must raise UnicodeDecodeError; otherwise
        the decoded text must equal *expected* and the consumed length must
        lie between 0 and ``len(raw)``.
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue

            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as exc:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, exc))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each case with code page *cp* and compare the result.

        *tests* is an iterable of ``(text, errors, expected)`` triples.  When
        *expected* is None the call must raise UnicodeEncodeError; otherwise
        the encoded bytes must equal *expected* and the whole text must be
        reported as consumed.
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue

            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as exc:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, exc))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        """Code page 932: valid text, error handlers and invalid bytes."""
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        """Code page 1252: valid text and an unencodable character."""
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        """Code page 65000 (UTF-7) in both directions."""
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        """An invalid byte directly before a valid multibyte sequence."""
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The CP_UTF8 encode cases are only run on Vista or later.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        """With final=False a trailing incomplete sequence is not consumed."""
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2436
2437
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()