blob: 5cef4dac1c76dd090aa15332b1654ec373f7f9b6 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only on Windows whose reported major version is 6 or newer; always
# False on every other platform (getwindowsversion() exists only on Windows,
# so the platform test must short-circuit first).
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T to the sentinel -1 so callers can test for its absence.
try:
    import ctypes
except ImportError:
    ctypes, SIZEOF_WCHAR_T = None, -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to test case *self*.

    The helper calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result is the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        outcome = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(outcome, wanted)

    return check
28
class Queue(object):
    """
    FIFO buffer: data is appended at one end and consumed from the other.

    The buffer keeps the type of the initial value passed to the
    constructor (the tests in this file use ``b""``).
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* (the default) drains the whole buffer.
        """
        if size < 0:
            # Slicing with [:0] yields an empty value of the same type.
            taken, remaining = self._buffer, self._buffer[:0]
        else:
            taken, remaining = self._buffer[:size], self._buffer[size:]
        self._buffer = remaining
        return taken
48
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for codecs.

    The methods call ``self.assert*`` helpers, so the mixin has to be
    combined with a class that provides them (the classes in this file
    use ``unittest.TestCase``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every offset and resume decoding from saved state.

        For each split point the first part is decoded, the decoder state
        is captured and transplanted into a brand-new decoder, and the
        remainder is decoded with ``final=True``.  The concatenated output
        must equal *u*.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split_at in range(len(s) + 1):
            first = make_decoder()
            head = first.decode(s[:split_at])
            saved = first.getstate()
            flags = saved[1]
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                pending = saved[0]
                # reset decoder to the default state without anything buffered
                first.setstate((pending[:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(first.decode(pending))
                # The decoder must return to the same state
                self.assertEqual(saved, first.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            second = make_decoder()
            second.setstate(saved)
            tail = second.decode(s[split_at:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every offset and resume encoding from saved state.

        The state captured after encoding the first part is installed in
        a fresh encoder which encodes the rest with ``final=True``; the
        concatenated output must equal *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for split_at in range(len(u) + 1):
            first = make_encoder()
            head = first.encode(u[:split_at])
            saved = first.getstate()
            second = make_encoder()
            second.setstate(saved)
            tail = second.encode(u[split_at:], True)
            self.assertEqual(s, head + tail)
81
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test classes.

    Every method reads ``self.encoding`` and uses ``self.assert*``
    helpers, so concrete subclasses set an ``encoding`` class attribute
    and also derive from ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* byte by byte and compare the running result.

        *input* is encoded with ``self.encoding``; after each byte is fed
        to the decoder the text decoded so far must equal the matching
        entry of *partialresults*.  The check is performed with a
        StreamReader, with an incremental decoder, with the same decoder
        after ``reset()``, and finally through ``codecs.iterdecode()``.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        """Exercise StreamReader.readline() with every line terminator."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The terminators are spelled out explicitly: str.split() without
        # arguments treats each of them as whitespace, so splitting a
        # string made only of terminators and spaces yields an empty list
        # and the loops below would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps even the first line long and non-empty; an empty
            # line would end readalllines() early when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expectation must as well.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield the lines of *s* in order."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that receives its data piecemeal."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three consecutive "\\r\\n" lines are read back unchanged."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five consecutive "\\r\\n" lines are read back unchanged."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000301
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-detecting "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a BOM, in little- and big-endian byte order.
    spamle = (b'\xff\xfe\x00\x00'
              + 2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              + 2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec_info = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        stream_writer = codec_info.streamwriter(sink)
        for _ in range(2):
            stream_writer.write("spam")
        payload = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(payload, (self.spamle, self.spambe))
        # try to read it back
        stream_reader = codec_info.streamreader(io.BytesIO(payload))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        for count in (4, 8):
            source = io.BytesIO(count * b"\xff")
            stream_reader = codecs.getreader(self.encoding)(source)
            with self.assertRaises(UnicodeError):
                stream_reader.read()

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("", 4),  # the four BOM bytes; byteorder known after the last
            ("", 3),
            ("\x00", 4),
            ("\x00\xff", 4),
            ("\x00\xff\u0100", 4),
            ("\x00\xff\u0100\uffff", 4),
            ("\x00\xff\u0100\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_handlers(self):
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        variants = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),  # little endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),  # big endian
        )
        for bom, unit in variants:
            encoded = bom + unit * 1024
            self.assertEqual(expected, codecs.utf_32_decode(encoded)[0])
392
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""

    encoding = "utf-32-le"

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("", 3),
            ("\x00", 4),
            ("\x00\xff", 4),
            ("\x00\xff\u0100", 4),
            ("\x00\xff\u0100\uffff", 4),
            ("\x00\xff\u0100\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        raw = b'\x00\x00\x01\x00' * repeat
        self.assertEqual('\U00010000' * repeat,
                         codecs.utf_32_le_decode(raw)[0])
436
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""

    encoding = "utf-32-be"

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("", 3),
            ("\x00", 4),
            ("\x00\xff", 4),
            ("\x00\xff\u0100", 4),
            ("\x00\xff\u0100\uffff", 4),
            ("\x00\xff\u0100\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        raw = b'\x00\x01\x00\x00' * repeat
        self.assertEqual('\U00010000' * repeat,
                         codecs.utf_32_be_decode(raw)[0])
480
481
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-detecting "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a BOM, in little- and big-endian byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the offending bytes on failure
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must be rejected."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the universal-newlines flag 'U':
        # that flag is deprecated and newer versions of open() reject it,
        # while the "\r\n" round trip checked here does not depend on it.
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000561
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""

    encoding = "utf-16-le"

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("", 1),
            ("\x00", 2),
            ("\x00\xff", 2),
            ("\x00\xff\u0100", 2),
            ("\x00\xff\u0100\uffff", 4),
            ("\x00\xff\u0100\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)

    def test_nonbmp(self):
        text, raw = "\U00010203", b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
604
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian "utf-16-be" codec (no BOM)."""

    encoding = "utf-16-be"

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("", 1),
            ("\x00", 2),
            ("\x00\xff", 2),
            ("\x00\xff\u0100", 2),
            ("\x00\xff\u0100\uffff", 4),
            ("\x00\xff\u0100\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16be', 'replace'),
                             replaced)

    def test_nonbmp(self):
        text, raw = "\U00010203", b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
647
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        # (decoded text so far, number of consecutive bytes yielding it)
        stages = [
            ("\x00", 2),
            ("\x00\xff", 2),
            ("\x00\xff\u07ff", 3),
            ("\x00\xff\u07ff\u0800", 3),
            ("\x00\xff\u07ff\u0800\uffff", 4),
            ("\x00\xff\u07ff\u0800\uffff\U00010000", 1),
        ]
        expected = [text for text, repeat in stages for _ in range(repeat)]
        self.check_partial("\x00\xff\u07ff\u0800\uffff\U00010000", expected)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # (error handler, expected encoding of "[\uDC80]")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, encoded in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), encoded)

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Incomplete or interrupted surrogate sequences must still fail.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000706
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only cp65001 codec."""

    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes); an expected
        # value of None means the encode call must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Each case is (bytes, error handler, expected text); an expected
        # value of None means the decode call must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # With an error handler the surrogate is substituted as listed.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # Each pair must round-trip through the "surrogatepass" handler.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
805
806
807
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the utf-7 codec."""

    encoding = "utf-7"

    def test_partial(self):
        # Expected decoder output after each successive step of incremental
        # decoding of the encoded form of "a+-b".
        text = "a+-b"
        expected_prefixes = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected_prefixes)

    def test_errors(self):
        # (malformed input, result of decoding with errors='replace')
        cases = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        strict_decode = codecs.utf_7_decode
        for raw, expected in cases:
            with self.subTest(raw=raw):
                # Strict decoding of the complete (final=True) input fails...
                with self.assertRaises(UnicodeDecodeError):
                    strict_decode(raw, 'strict', True)
                # ...while 'replace' yields the expected substituted text.
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # A non-BMP character and its surrogate-pair spelling encode to the
        # same bytes, which decode back to the single non-BMP character.
        encoded = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')
852
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A single byte cannot be decoded when final=True.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
860
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode."""

    def test_array(self):
        import array
        # An array of signed bytes is accepted as a buffer.
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are TypeErrors.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
876
class UTF8SigTest(ReadTest, unittest.TestCase):
    """Tests for the utf-8-sig codec (UTF-8 with an optional leading BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM added by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read `bytestring` through a utf-8-sig StreamReader using a range of
        # read sizes (None means "read everything in one call") and check
        # that the concatenated chunks equal `unistring`.  Each size runs in
        # its own subTest so a failure names the offending sizehint.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is None:
                        data = istream.read()
                    else:
                        data = istream.read(sizehint)
                    # An empty result signals end of stream.
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
960
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by b'0', passes through
        # unchanged and both bytes are reported as consumed.
        decode = codecs.escape_decode
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            payload = byte + b'0'
            self.assertEqual(decode(payload), (payload, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes)
        cases = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, decoded in cases:
            check(escaped, decoded)
        # A backslash before any byte that does not start a recognised
        # escape is kept literally.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            literal = b'\\' + bytes([code])
            check(literal, literal)

    def test_errors(self):
        # Truncated \x escapes: strict mode raises ValueError, while the
        # 'ignore' and 'replace' handlers drop or substitute the bad escape
        # and still report the whole input as consumed.
        decode = codecs.escape_decode
        for bad, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + bad + b"]"
            with self.assertRaises(ValueError):
                decode(bad)
            with self.assertRaises(ValueError):
                decode(bracketed)
            self.assertEqual(decode(bracketed + bad, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(decode(bracketed + bad, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001012
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Regression test: Python used to crash on this at exit because of
        # a refcount bug in _codecsmodule.c.  There is nothing to assert;
        # the write/close sequence just has to complete.
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001021
# From RFC 3492
# Each entry is a 2-tuple: (text as str, its punycode form as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1125
# Import-time sanity check: print any punycode_testcases entry that is not
# a 2-item sequence (for example because of a missing comma).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001129
class PunycodeTest(unittest.TestCase):
    """Encode/decode checks driven by the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii").lower()
            wanted = str(puny, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII
            # str round-trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001148
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated unicode_internal codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        # Byte strings below are written big-endian and reversed on
        # little-endian machines.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings():
                decoded = internal.decode("unicode_internal")
                self.assertEqual(uni, decoded)
        for internal in not_ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings(deprecation):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is one past the largest valid code point.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must describe the failing codec,
        # the input object and the offending byte range.
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        try:
            with support.check_warnings(deprecation):
                payload.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(payload, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A custom error handler (here an alias for "ignore") must be able
        # to skip an undecodable 4-byte unit in the middle of the input.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        with support.check_warnings(deprecation):
            ab = "ab".encode("unicode_internal").decode()
            corrupted = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decoder(bytes(corrupted, "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        with support.check_warnings(deprecation):
            # Issue 3739
            # The second item of the encoder result is the number of input
            # characters consumed.
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001224
Martin v. Löwis2548c732003-04-18 10:39:54 +00001225# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1226nameprep_tests = [
1227 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001228 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1229 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1230 b'\xb8\x8f\xef\xbb\xbf',
1231 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001232 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001233 (b'CAFE',
1234 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001235 # 3.3 Case folding 8bit U+00DF (german sharp s).
1236 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001237 (b'\xc3\x9f',
1238 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001239 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001240 (b'\xc4\xb0',
1241 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001242 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001243 (b'\xc5\x83\xcd\xba',
1244 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001245 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1246 # XXX: skip this as it fails in UCS-2 mode
1247 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1248 # 'telc\xe2\x88\x95kg\xcf\x83'),
1249 (None, None),
1250 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001251 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1252 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001253 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001254 (b'\xe1\xbe\xb7',
1255 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256 # 3.9 Self-reverting case folding U+01F0 and normalization.
1257 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001258 (b'\xc7\xb0',
1259 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001260 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001261 (b'\xce\x90',
1262 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001263 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001264 (b'\xce\xb0',
1265 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001266 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001267 (b'\xe1\xba\x96',
1268 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001269 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001270 (b'\xe1\xbd\x96',
1271 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001272 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001273 (b' ',
1274 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001275 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001276 (b'\xc2\xa0',
1277 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001278 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001279 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280 None),
1281 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001282 (b'\xe2\x80\x80',
1283 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001284 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001285 (b'\xe2\x80\x8b',
1286 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001288 (b'\xe3\x80\x80',
1289 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001290 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001291 (b'\x10\x7f',
1292 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001294 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001295 None),
1296 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001297 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298 None),
1299 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001300 (b'\xef\xbb\xbf',
1301 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001303 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001304 None),
1305 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001306 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 None),
1308 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310 None),
1311 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313 None),
1314 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001316 None),
1317 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001318 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001319 None),
1320 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001321 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001322 None),
1323 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001324 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 None),
1326 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001327 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328 None),
1329 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001330 (b'\xcd\x81',
1331 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001333 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334 None),
1335 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001336 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001337 None),
1338 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001339 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340 None),
1341 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001342 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001343 None),
1344 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001345 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346 None),
1347 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001348 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349 None),
1350 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001351 (b'foo\xef\xb9\xb6bar',
1352 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001354 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001355 None),
1356 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001357 (b'\xd8\xa71\xd8\xa8',
1358 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001359 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001360 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001361 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001362 # None),
1363 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001364 # 3.44 Larger test (shrinking).
1365 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001366 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1367 b'\xaa\xce\xb0\xe2\x80\x80',
1368 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001369 # 3.45 Larger test (expanding).
1370 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001371 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1372 b'\x80',
1373 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1374 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1375 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001376 ]
1377
1378
class NameprepTest(unittest.TestCase):
    # Feeds every (input, expected) pair of the module-level
    # ``nameprep_tests`` table through encodings.idna.nameprep().
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            # Both outcomes are wrapped so that any failure names the
            # offending vector ("Test 3.N") instead of only the generic
            # assertion text.
            try:
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001397
class IDNACodecTest(unittest.TestCase):
    # Exercises the "idna" codec through str()/encode(), a stream reader,
    # and the incremental decoder/encoder interfaces.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of the stream must yield an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time; cover ASCII and non-ASCII
        # names, each with and without a trailing dot.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four shapes as in test_incremental_decode, in the other
        # direction.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001473
class CodecsModuleTest(unittest.TestCase):
    # Argument checking and lookup behaviour of the module-level
    # functions in ``codecs``.

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding given: the default must handle plain ASCII bytes.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it runs even when the
        # test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1528
class StreamReaderTest(unittest.TestCase):
    # readlines() on a UTF-8 stream reader over a two-line byte stream.

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # UTF-8 bytes for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001538
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile transcodes between a data encoding (second
    # argument) and the encoding of the wrapped file (third argument).

    def test_basic(self):
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data ends up in the file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001550
# Codec names iterated over by the generic round-trip tests below.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module exposes it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings additionally excluded from the incremental coder checks.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001668
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks run against every name in ``all_unicode_encodings``.

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, modulo
            # "_" versus "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Fresh queue: feed the encoded bytes back one at a time.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-bytes argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1806
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() with the three supported mapping shapes:
    # a str indexed by byte value, a dict of int -> str, and a dict of
    # int -> int.  Each is checked under "strict", "replace" and "ignore".

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 is past the end of the map.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # U+FFFE in the map marks the byte as undefined.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A byte may map to more than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to none at all.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point beyond sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
2000
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002001
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers.

    def test_encodedfile(self):
        # UTF-8 bytes in the underlying file are read back as Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            self.assertEqual(recoded.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through a StreamReaderWriter yields the decoded text.
        utf8 = codecs.lookup("utf-8")
        raw = io.BytesIO(b"\xc3\xbc")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002014
class TypesTest(unittest.TestCase):
    # Which input types the low-level decoder functions accept.

    def test_decode_unicode(self):
        # Most decoders reject str input with TypeError.
        decoder_names = (
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        )
        decoders = [getattr(codecs, name) for name in decoder_names]
        # mbcs_decode is not present on every platform; test it when it is.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # A code point above U+10FFFF is an error in strict mode and becomes
        # U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
2050
Serhiy Storchakad6793772013-01-29 10:20:44 +02002051
2052class UnicodeEscapeTest(unittest.TestCase):
2053 def test_empty(self):
2054 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2055 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2056
2057 def test_raw_encode(self):
2058 encode = codecs.unicode_escape_encode
2059 for b in range(32, 127):
2060 if b != b'\\'[0]:
2061 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2062
2063 def test_raw_decode(self):
2064 decode = codecs.unicode_escape_decode
2065 for b in range(256):
2066 if b != b'\\'[0]:
2067 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2068
2069 def test_escape_encode(self):
2070 encode = codecs.unicode_escape_encode
2071 check = coding_checker(self, encode)
2072 check('\t', br'\t')
2073 check('\n', br'\n')
2074 check('\r', br'\r')
2075 check('\\', br'\\')
2076 for b in range(32):
2077 if chr(b) not in '\t\n\r':
2078 check(chr(b), ('\\x%02x' % b).encode())
2079 for b in range(127, 256):
2080 check(chr(b), ('\\x%02x' % b).encode())
2081 check('\u20ac', br'\u20ac')
2082 check('\U0001d120', br'\U0001d120')
2083
2084 def test_escape_decode(self):
2085 decode = codecs.unicode_escape_decode
2086 check = coding_checker(self, decode)
2087 check(b"[\\\n]", "[]")
2088 check(br'[\"]', '["]')
2089 check(br"[\']", "[']")
2090 check(br"[\\]", r"[\]")
2091 check(br"[\a]", "[\x07]")
2092 check(br"[\b]", "[\x08]")
2093 check(br"[\t]", "[\x09]")
2094 check(br"[\n]", "[\x0a]")
2095 check(br"[\v]", "[\x0b]")
2096 check(br"[\f]", "[\x0c]")
2097 check(br"[\r]", "[\x0d]")
2098 check(br"[\7]", "[\x07]")
2099 check(br"[\8]", r"[\8]")
2100 check(br"[\78]", "[\x078]")
2101 check(br"[\41]", "[!]")
2102 check(br"[\418]", "[!8]")
2103 check(br"[\101]", "[A]")
2104 check(br"[\1010]", "[A0]")
2105 check(br"[\x41]", "[A]")
2106 check(br"[\x410]", "[A0]")
2107 check(br"\u20ac", "\u20ac")
2108 check(br"\U0001d120", "\U0001d120")
2109 for b in range(256):
2110 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2111 check(b'\\' + bytes([b]), '\\' + chr(b))
2112
2113 def test_decode_errors(self):
2114 decode = codecs.unicode_escape_decode
2115 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2116 for i in range(d):
2117 self.assertRaises(UnicodeDecodeError, decode,
2118 b"\\" + c + b"0"*i)
2119 self.assertRaises(UnicodeDecodeError, decode,
2120 b"[\\" + c + b"0"*i + b"]")
2121 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2122 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2123 self.assertEqual(decode(data, "replace"),
2124 ("[\ufffd]\ufffd", len(data)))
2125 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2126 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2127 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2128
2129
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002130class RawUnicodeEscapeTest(unittest.TestCase):
2131 def test_empty(self):
2132 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2133 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2134
2135 def test_raw_encode(self):
2136 encode = codecs.raw_unicode_escape_encode
2137 for b in range(256):
2138 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2139
2140 def test_raw_decode(self):
2141 decode = codecs.raw_unicode_escape_decode
2142 for b in range(256):
2143 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2144
2145 def test_escape_encode(self):
2146 encode = codecs.raw_unicode_escape_encode
2147 check = coding_checker(self, encode)
2148 for b in range(256):
2149 if b not in b'uU':
2150 check('\\' + chr(b), b'\\' + bytes([b]))
2151 check('\u20ac', br'\u20ac')
2152 check('\U0001d120', br'\U0001d120')
2153
2154 def test_escape_decode(self):
2155 decode = codecs.raw_unicode_escape_decode
2156 check = coding_checker(self, decode)
2157 for b in range(256):
2158 if b not in b'uU':
2159 check(b'\\' + bytes([b]), '\\' + chr(b))
2160 check(br"\u20ac", "\u20ac")
2161 check(br"\U0001d120", "\U0001d120")
2162
2163 def test_decode_errors(self):
2164 decode = codecs.raw_unicode_escape_decode
2165 for c, d in (b'u', 4), (b'U', 4):
2166 for i in range(d):
2167 self.assertRaises(UnicodeDecodeError, decode,
2168 b"\\" + c + b"0"*i)
2169 self.assertRaises(UnicodeDecodeError, decode,
2170 b"[\\" + c + b"0"*i + b"]")
2171 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2172 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2173 self.assertEqual(decode(data, "replace"),
2174 ("[\ufffd]\ufffd", len(data)))
2175 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2176 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2177 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2178
2179
class SurrogateEscapeTest(unittest.TestCase):
    # Round trips through the "surrogateescape" error handler.

    def test_utf8(self):
        # Bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
        self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)
        # bad-utf-8 encoded surrogate
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
        self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2212
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002213
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # For each UTF-16/UTF-32 variant, text written through codecs.open()
        # must read back unchanged, whatever mix of write() and seek() calls
        # produced it.
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002269
Victor Stinner3fed0872010-05-22 02:16:27 +00002270
# Bytes-to-bytes codecs that are always listed.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# The compression codecs are listed only when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]

2289
class TransformCodecTest(unittest.TestCase):
    # Exercises every codec named in bytes_transform_encodings.

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoder = codecs.getencoder(encoding)
            encoded, consumed = encoder(payload)
            self.assertEqual(consumed, len(payload))
            decoder = codecs.getdecoder(encoding)
            decoded, consumed = decoder(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        # A stream reader must give back the original byte.
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # Same as test_read, but through readline(); two codecs are skipped.
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.readline(), b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            plain_view = memoryview(original)
            encoded = codecs.encode(original, encoding)
            encoded_from_view = codecs.encode(plain_view, encoding)
            self.assertEqual(encoded_from_view, encoded)
            encoded_view = memoryview(encoded)
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(encoded_view, encoding)
            self.assertEqual(decoded_from_view, decoded)
2334
2335
Georg Brandl02524622010-12-02 18:06:51 +00002336
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # Negative code page numbers raise ValueError; 123 raises OSError.
        for function, sample in ((codecs.code_page_encode, 'a'),
                                 (codecs.code_page_decode, b'a')):
            self.assertRaises(ValueError, function, -1, sample)
        for function, sample in ((codecs.code_page_encode, 'a'),
                                 (codecs.code_page_decode, b'a')):
            self.assertRaises(OSError, function, 123, sample)

    def test_code_page_name(self):
        # The error message must name the code page.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Each entry of *tests* is (raw bytes, error handler, expected text);
        # an expected value of None means decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry of *tests* is (text, error handler, expected bytes);
        # an expected value of None means encoding must raise
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = encoded[0]
            consumed = encoded[1]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)
        utf8_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_cases)
        # The CP_UTF8 encode cases run only when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # With final=False, a trailing incomplete multibyte sequence is left
        # unconsumed instead of being reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2484
2485
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()