blob: 35170579a78cd4d5c6154eda41fe73037f8b5aa2 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only on Windows whose reported major version is 6 or newer; the
# version query is never made on other platforms.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional: when it cannot be imported, ``ctypes`` is None and
# SIZEOF_WCHAR_T is the sentinel -1.
try:
    import ctypes
except ImportError:
    ctypes = None

if ctypes is None:
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
def coding_checker(self, coder):
    """Build a checker for *coder* that reports through test case *self*.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``, i.e. the expected output together with
    the full input length.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
28
class Queue(object):
    """
    FIFO buffer: data is appended with write() and consumed from the
    front with read().
    """

    def __init__(self, buffer):
        # *buffer* is the initial content; its type decides what can be
        # written and what read() returns.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* (the default) drains everything that is pending.
        """
        pending = self._buffer
        if size < 0:
            # pending[:0] is an empty object of the same type
            taken, remaining = pending, pending[:0]
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
48
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    The host class must provide the unittest assertion methods.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* in two parts at every split point, carrying state over.

        For each split the state of the first decoder is transplanted into
        a fresh decoder, and the two decoded parts must add up to *u*.
        """
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            first = new_decoder()
            head = first.decode(s[:split])
            state = first.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                first.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(first.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, first.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            second = new_decoder()
            second.setstate(state)
            tail = second.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* in two parts at every split point, carrying state over.

        The state of the first encoder is copied into a fresh encoder and
        the two encoded parts must add up to *s*.
        """
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            first = new_encoder()
            head = first.encode(u[:split])
            state = first.getstate()
            second = new_encoder()
            second.setstate(state)
            tail = second.encode(u[split:], True)
            self.assertEqual(s, head + tail)
81
class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the codec test cases.

    Concrete subclasses set an ``encoding`` class attribute and provide the
    unittest assertion methods.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one encoded byte at a time and compare the output.

        After each byte the text decoded so far must equal the matching
        entry of *partialresults*.  Bytes and expectations are paired with
        zip(), so surplus items on either side are not checked.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same check as above, driven through an incremental decoder.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        feed_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        """readline() must split on all line ends, with and without keepends."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: every one of them is
        # whitespace, so building the list with str.split() yields [] and
        # the loops below would never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would end readalllines() early.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a reader must yield exactly the lines that were written."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that receives its data piece by piece."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three CRLF-terminated lines are read back one by one, then ""."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        """Five short CRLF-terminated lines are read back one by one, then ""."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)
Walter Dörwald9fa09462005-01-10 12:01:39 +0000301
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        make_writer = codecs.getwriter(self.encoding)
        make_reader = codecs.getreader(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = make_writer(sink)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(make_reader(io.BytesIO(data)).read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM makes read() fail."""
        for count in (4, 8):
            stream = io.BytesIO(count*b"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive);
        # the first 4 of the leading 7 bytes are the BOM.
        steps = ((7, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_handlers(self):
        """'replace' and 'ignore' handle a truncated final unit."""
        for (errors, decoded) in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', errors, True))

    def test_errors(self):
        """A truncated unit is an error in strict, final mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """State transplanting works for both byte orders."""
        for raw in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding, "spamspam", raw)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        for encoded in (encoded_le, encoded_be):
            self.assertEqual(expected, codecs.utf_32_decode(encoded)[0])
392
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive)
        steps = ((3, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_simple(self):
        """The four bytes of a code point are written least significant first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated unit is an error in strict, final mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
436
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive)
        steps = ((3, 0), (4, 1), (4, 2), (4, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_simple(self):
        """The four bytes of a code point are written most significant first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated unit is an error in strict, final mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
480
481
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        make_writer = codecs.getwriter(self.encoding)
        make_reader = codecs.getreader(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = make_writer(sink)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(make_reader(io.BytesIO(data)).read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM makes read() fail."""
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive);
        # the first 2 of the leading 3 bytes are the BOM.
        steps = ((3, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_handlers(self):
        """'replace' and 'ignore' handle a truncated final unit."""
        for (errors, decoded) in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', errors, True))

    def test_errors(self):
        """A truncated unit is an error in strict, final mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """State transplanting works for both byte orders."""
        for raw in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding, "spamspam", raw)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # NOTE(review): the 'U' mode flag is deprecated in later Python
        # versions - confirm against the interpreter this suite targets.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000561
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""

    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive)
        steps = ((1, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced otherwise."""
        # (malformed bytes, result of decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for (raw, replaced) in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        raw = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), char)
604
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-16-be" codec (no BOM)."""

    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u0100\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive)
        steps = ((1, 0), (2, 1), (2, 2), (2, 3), (4, 4), (1, 5))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced otherwise."""
        # (malformed bytes, result of decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for (raw, replaced) in cases:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        raw = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), char)
647
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        """Byte-wise decoding only emits a character once it is complete."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # (bytes fed, characters available while those bytes arrive)
        steps = ((2, 1), (2, 2), (3, 3), (3, 4), (4, 5), (1, 6))
        expected = [text[:nchars]
                    for (nbytes, nchars) in steps
                    for _ in range(nbytes)]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        """State transplanting works at every split point of the sample."""
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         text, text.encode(self.encoding))

    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler steps in."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # (error handler, expected encoding of "[\uDC80]")
        handled = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for (errors, encoded) in handled:
            self.assertEqual("[\uDC80]".encode("utf-8", errors), encoded)

    def test_surrogatepass_handler(self):
        """'surrogatepass' round-trips surrogates but rejects broken input."""
        roundtrips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for (text, raw) in roundtrips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # incomplete or interrupted surrogate sequences still fail
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000706
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the "cp65001" codec; the whole class is skipped unless
    # running on win32.
    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes).  An expected
        # value of None means the encode call must raise.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                '%a.encode("cp65001", %r)=%a != %a'
                % (text, errors, encoded, expected))

    def test_decode(self):
        # Each case is (bytes, error handler, expected text).  An expected
        # value of None means the decode call must raise.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                '%a.decode("cp65001", %r)=%a != %a'
                % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects a lone surrogate in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # Result of encoding "[\uDC80]" under each error handler.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text, bytes) pairs that must round-trip with "surrogatepass".
        roundtrips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in roundtrips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
805
806
807
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-7" codec.
    encoding = "utf-7"

    def test_partial(self):
        # check_partial comes from the base class (outside this chunk).
        text = "a+-b"
        # Number of characters expected to be available at each step.
        decoded_counts = [1, 1, 2, 3, 4]
        self.check_partial(text, [text[:count] for count in decoded_counts])

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        malformed = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for data, replaced in malformed:
            # Strict decoding must reject the input ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_7_decode(data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD for the bad parts.
            self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # A non-BMP character and its surrogate-pair spelling encode to the
        # same bytes, which decode back to the single non-BMP character.
        wire = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), wire)
        self.assertEqual(wire.decode(self.encoding), '\U000104A0')
851
class UTF16ExTest(unittest.TestCase):
    # Error and argument handling of codecs.utf_16_ex_decode.

    def test_errors(self):
        # Decoding the single byte b"\xff" in strict mode must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
859
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode.

    def test_array(self):
        import array
        # An array of signed bytes is returned as bytes plus its length.
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are rejected.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
875
class UTF8SigTest(ReadTest, unittest.TestCase):
    # Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM).
    encoding = "utf-8-sig"

    def test_partial(self):
        # check_partial comes from the base class (outside this chunk).
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding adds a BOM; the incremental decoder must strip it again.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader in chunks of
        # several sizes (None means a single unbounded read() per call) and
        # check that the concatenated output equals *unistring* each time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
959
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode.

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

    def test_raw(self):
        # Any single byte other than a backslash, followed by b'0', is
        # passed through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            self.assertEqual(decode(byte + b'0'), (byte + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes)
        escapes = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, expected in escapes:
            check(escaped, expected)
        # A backslash followed by any byte that does not start a known
        # escape is left untouched.
        known = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in known:
                continue
            unknown = b'\\' + bytes([code])
            check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes, with zero and with one hex digit.
        for broken in (br"\x", br"\x0"):
            bracketed = b"[" + broken + b"]"
            self.assertRaises(ValueError, decode, broken)
            self.assertRaises(ValueError, decode, bracketed)
            doubled = bracketed + broken
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001011
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001020
# Punycode sample strings from RFC 3492.
# Each entry is a pair: (unicode text, punycode-encoded bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Sanity check at import time: report any entry that is not a 2-tuple
# (for example because of a missing comma between adjacent literals).
# The loop variable keeps its original name so the module namespace is
# unchanged.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001128
class PunycodeTest(unittest.TestCase):
    # Runs the punycode codec over the module-level punycode_testcases.

    def test_encode(self):
        for text, encoded in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(text.encode("punycode"), "ascii").lower()
            wanted = str(encoded, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001147
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the deprecated "unicode_internal" codec.  The decode tests
    # are only run when wchar_t is 4 bytes wide.

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The literals above are written big-endian; reverse them on
        # little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is just past the last valid code point.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports a missing exception with a useful message,
        # and because the exception is caught inside check_warnings the
        # expected DeprecationWarning is verified as well.
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            with self.assertRaises(UnicodeDecodeError) as cm:
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        # The error must cover exactly the second 4-byte unit.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register ignore_errors under a private name and check that the
        # invalid 4-byte unit spliced between "a" and "b" is skipped.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item returned by an encoder is the number of
            # input characters consumed.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001223
Martin v. Löwis2548c732003-04-18 10:39:54 +00001224# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1225nameprep_tests = [
1226 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001227 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1228 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1229 b'\xb8\x8f\xef\xbb\xbf',
1230 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001231 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001232 (b'CAFE',
1233 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001234 # 3.3 Case folding 8bit U+00DF (german sharp s).
1235 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001236 (b'\xc3\x9f',
1237 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001238 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001239 (b'\xc4\xb0',
1240 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001241 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001242 (b'\xc5\x83\xcd\xba',
1243 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001244 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1245 # XXX: skip this as it fails in UCS-2 mode
1246 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1247 # 'telc\xe2\x88\x95kg\xcf\x83'),
1248 (None, None),
1249 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001250 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1251 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001252 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001253 (b'\xe1\xbe\xb7',
1254 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001255 # 3.9 Self-reverting case folding U+01F0 and normalization.
1256 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001257 (b'\xc7\xb0',
1258 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001259 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001260 (b'\xce\x90',
1261 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001262 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001263 (b'\xce\xb0',
1264 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001266 (b'\xe1\xba\x96',
1267 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001269 (b'\xe1\xbd\x96',
1270 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001271 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001272 (b' ',
1273 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001274 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001275 (b'\xc2\xa0',
1276 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001277 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001278 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001279 None),
1280 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001281 (b'\xe2\x80\x80',
1282 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001284 (b'\xe2\x80\x8b',
1285 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001286 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001287 (b'\xe3\x80\x80',
1288 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001289 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001290 (b'\x10\x7f',
1291 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001292 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001293 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001294 None),
1295 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001296 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001297 None),
1298 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001299 (b'\xef\xbb\xbf',
1300 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001302 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303 None),
1304 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001305 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001306 None),
1307 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001308 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309 None),
1310 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001311 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312 None),
1313 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001314 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315 None),
1316 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001317 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001318 None),
1319 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001320 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 None),
1322 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001323 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001324 None),
1325 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001326 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001327 None),
1328 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001329 (b'\xcd\x81',
1330 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001331 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001332 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001333 None),
1334 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001335 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001336 None),
1337 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001338 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339 None),
1340 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001341 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001342 None),
1343 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001344 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001345 None),
1346 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001347 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001348 None),
1349 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001350 (b'foo\xef\xb9\xb6bar',
1351 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001353 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001354 None),
1355 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001356 (b'\xd8\xa71\xd8\xa8',
1357 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001359 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001360 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001361 # None),
1362 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001363 # 3.44 Larger test (shrinking).
1364 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001365 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1366 b'\xaa\xce\xb0\xe2\x80\x80',
1367 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001368 # 3.45 Larger test (expanding).
1369 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001370 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1371 b'\x80',
1372 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1373 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1374 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001375 ]
1376
1377
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        """Check every (orig, prepped) pair of ``nameprep_tests``.

        A pair whose ``orig`` is None is skipped.  A pair whose ``prepped``
        is None must make nameprep() raise UnicodeError.  Any other pair
        must map ``orig`` to ``prepped``.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Same numbering as the "# 3.N" comments in the vector list,
            # so a failure can be traced back to its entry.
            label = "Test 3.%d" % (pos+1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    raise support.TestFailed("%s: %s" % (label, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001396
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via str/bytes, streams and the
    incremental encoder/decoder."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        def bytewise(data):
            # Feed the input to iterdecode() one byte at a time.
            return "".join(
                codecs.iterdecode((bytes([c]) for c in data), "idna"))

        self.assertEqual(bytewise(b"python.org"), "python.org")
        self.assertEqual(bytewise(b"python.org."), "python.org.")
        # Non-ASCII label, without and with a trailing dot (the original
        # repeated the trailing-dot case twice).
        self.assertEqual(bytewise(b"xn--pythn-mua.org"), "pyth\xf6n.org")
        self.assertEqual(bytewise(b"xn--pythn-mua.org."), "pyth\xf6n.org.")

        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        def piecewise(text):
            # iterencode() hands the string to the encoder item by item.
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(piecewise("python.org"), b"python.org")
        self.assertEqual(piecewise("python.org."), b"python.org.")
        # Non-ASCII label, without and with a trailing dot (the original
        # repeated the trailing-dot case twice).
        self.assertEqual(piecewise("pyth\xf6n.org"), b"xn--pythn-mua.org")
        self.assertEqual(piecewise("pyth\xf6n.org."), b"xn--pythn-mua.org.")

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001472
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of the module-level
    functions of ``codecs``."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # '\uffff' rather than '\xffff': the latter is '\xff' followed by
        # the two letters "ff", which was most likely not intended.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\uffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching, so the previous locale is restored
        # on every exit path, including the skip below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1527
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two multi-byte characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001537
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written to the wrapper land in the file
        # as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001549
# Encodings exercised by BasicUnicodeTest.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested where the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings excluded from the incremental encoder/decoder checks.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001667
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Round-trip checks applied to every entry of all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII string through each codec using the
        stateless, stream, incremental and iterencode/iterdecode APIs."""
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # NOTE(review): nesting of this section was rebuilt from
                # de-indented text; it is kept inside the
                # broken_incremental_coders guard -- confirm against the
                # upstream file.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading everything."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) an int."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject a missing argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1805
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with str, int->str and int->int mappings
    under the "strict", "replace" and "ignore" error handlers."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))
        self.assertEqual(decode(data, "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # A byte beyond the end of the map, or one mapped to U+FFFE,
        # is undecodable.
        self.assertRaises(UnicodeDecodeError, decode, data, "strict", "ab")
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", "ab\ufffe")

        self.assertEqual(decode(data, "replace", "ab"), ("ab\ufffd", 3))
        self.assertEqual(decode(data, "replace", "ab\ufffe"),
                         ("ab\ufffd", 3))

        self.assertEqual(decode(data, "ignore", "ab"), ("ab", 3))
        self.assertEqual(decode(data, "ignore", "ab\ufffe"), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(
            decode(data, "strict", {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3))
        self.assertEqual(
            decode(data, "strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3))
        self.assertEqual(
            decode(data, "strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3))
        self.assertEqual(
            decode(data, "strict", {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3))

        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: 'a', 1: 'b'})
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: 'a', 1: 'b', 2: None})
        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict",
                          {0: 'a', 1: 'b', 2: '\ufffe'})

        self.assertEqual(
            decode(data, "replace", {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3))
        self.assertEqual(
            decode(data, "replace", {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3))
        # Issue #14850
        self.assertEqual(
            decode(data, "replace", {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3))

        self.assertEqual(
            decode(data, "ignore", {0: 'a', 1: 'b'}),
            ("ab", 3))
        self.assertEqual(
            decode(data, "ignore", {0: 'a', 1: 'b', 2: None}),
            ("ab", 3))
        # Issue #14850
        self.assertEqual(
            decode(data, "ignore", {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            decode(data, "strict", {0: a, 1: b, 2: c}),
            ("abc", 3))

        # Issue #15379
        self.assertEqual(
            decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3))
        self.assertEqual(
            decode(data, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # A code point above sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
                          decode, data, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: a, 1: b})
        self.assertRaises(UnicodeDecodeError,
                          decode, data, "strict", {0: a, 1: b, 2: 0xFFFE})

        self.assertEqual(
            decode(data, "replace", {0: a, 1: b}),
            ("ab\ufffd", 3))
        self.assertEqual(
            decode(data, "replace", {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3))

        self.assertEqual(
            decode(data, "ignore", {0: a, 1: b}),
            ("ab", 3))
        self.assertEqual(
            decode(data, "ignore", {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3))
1999
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002000
class WithStmtTest(unittest.TestCase):
    """The codec stream wrappers can be used in ``with`` statements."""

    def test_encodedfile(self):
        # The stream holds UTF-8 data and is read back recoded as Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            recoded = recoder.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        # The same UTF-8 data is read back as text through a reader/writer
        # pair built from the codec's own stream classes.
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002013
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level decoders accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoder_names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        # mbcs_decode is only looked up when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoder_names.append("mbcs_decode")
        decoders = [getattr(codecs, name) for name in decoder_names]
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape naming a code point above U+10FFFF is an error by
        # default and yields U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2049
Serhiy Storchakad6793772013-01-29 10:20:44 +02002050
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode and unicode_escape_decode."""

    def test_empty(self):
        """Empty input gives empty output in both directions."""
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Printable ASCII other than the backslash is encoded unchanged."""
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """A byte other than the backslash decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'),
                                 (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Control, non-ASCII and non-BMP characters are written as escapes."""
        encode = codecs.unicode_escape_encode

        def check(text, expected):
            # The encoder must also report the whole input as consumed.
            self.assertEqual(encode(text), (expected, len(text)))

        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining C0 controls, DEL and the Latin-1 range use hex escapes.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Every recognised escape decodes; unknown ones are kept verbatim."""
        decode = codecs.unicode_escape_decode

        def check(data, expected):
            # The decoder must also report the whole input as consumed.
            self.assertEqual(decode(data), (expected, len(data)))

        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash before any byte that does not start a known escape
        # must come through with the backslash preserved.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        """Truncated or out-of-range escapes follow the error handler."""
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, hex digits in a complete escape).
        # The U form takes 8 digits, so lengths 0..7 are all truncated.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets and one at the end.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2127
2128
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode and its decoder."""

    def test_empty(self):
        """Empty input gives empty output in both directions."""
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Every code point below 256 is encoded as the byte of that value."""
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        """Every byte value decodes to the code point of the same value."""
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        """Only characters above Latin-1 are written as u/U escapes."""
        encode = codecs.raw_unicode_escape_encode

        def check(text, expected):
            # The encoder must also report the whole input as consumed.
            self.assertEqual(encode(text), (expected, len(text)))

        # A backslash before anything but 'u' or 'U' is not special.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        """Only the u and U escapes are interpreted when decoding."""
        decode = codecs.raw_unicode_escape_decode

        def check(data, expected):
            # The decoder must also report the whole input as consumed.
            self.assertEqual(decode(data), (expected, len(data)))

        # A backslash before anything but 'u' or 'U' is kept verbatim.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        """Truncated or out-of-range escapes follow the error handler."""
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, hex digits in a complete escape).
        # The U form takes 8 digits, so lengths 0..7 are all truncated.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets and one at the end.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above U+10FFFF.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2177
2178
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes with the surrogateescape handler."""

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2211
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002212
class BomTest(unittest.TestCase):
    """Check that seeking on codec-wrapped files keeps the output readable."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        path = support.TESTFN
        self.addCleanup(support.unlink, path)

        def reopen(encoding):
            # Every scenario starts from a freshly truncated file.
            return codecs.open(path, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002268
Victor Stinner3fed0872010-05-22 02:16:27 +00002269
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The first four are
# always listed; the compression codecs are added only when their backing
# module can be imported.
bytes_transform_encodings = list((
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
))
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2288
class TransformCodecTest(unittest.TestCase):
    """Exercise every codec named in bytes_transform_encodings."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # These two codecs are left out of the readline() check.
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                encoded = codecs.encode(b"\x80", encoding)
                stream = io.BytesIO(encoded)
                reader = codecs.getreader(encoding)(stream)
                self.assertEqual(reader.readline(), b"\x80")
2316
2317
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 and 123 must both be refused, each with its own exception type.
        for exc_type, cp in ((ValueError, -1), (WindowsError, 123)):
            with self.assertRaises(exc_type):
                codecs.code_page_encode(cp, 'a')
            with self.assertRaises(exc_type):
                codecs.code_page_decode(cp, b'a')

    def test_code_page_name(self):
        # The error text has to name the code page that failed.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) row with code page *cp*.

        An expected value of None means the call must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) row with code page *cp*.

        An expected value of None means the call must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_rows)
        self.check_decode(932, decode_rows)

    def test_cp1252(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_rows)
        self.check_decode(1252, decode_rows)

    def test_cp_utf7(self):
        cp = 65000
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_rows)
        self.check_decode(cp, decode_rows)

    def test_multibyte_encoding(self):
        cp932_rows = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_rows = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_rows)
        self.check_decode(self.CP_UTF8, utf8_rows)
        # The CP_UTF8 encode rows are only run on Windows Vista or later.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # Each row is (input, expected (text, consumed) result) for a
        # cp932 decode called with a final argument of False.
        rows = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in rows:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2465
2466
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()