# Source blob: a8b3da0f370b5a70e203b4179c51ed6f8771cdfb
import codecs
import encodings
import io
import locale
import sys
import unittest
import warnings

from test import support

# True only on Windows builds whose reported major version is at least 6.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T falls back to the sentinel -1.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Build an assertion helper around the codec function *coder*.

    *self* is the test case whose ``assertEqual`` is used.  The returned
    ``check(input, expect)`` asserts that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. that *coder* produced *expect* and
    reported the whole input as consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer fixes the sequence type (e.g. b"" or ""); every
        # later operation only concatenates to it or slices it.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
        return s

class MixInCheckStateHandling:
    """Mix-in with getstate()/setstate() round-trip checks for incremental codecs.

    The host class must provide the unittest assertion methods used below
    (``assertEqual``, ``assertIsInstance``, ``assertFalse``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted at any offset.

        For every split point ``i`` the first ``i`` items of *s* are decoded,
        the decoder state is extracted, a fresh decoder is set to that state
        and fed the remainder; the two parts must concatenate to *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted at any offset.

        Mirror image of check_state_handling_decode(): the encoder state
        taken after ``u[:i]`` is transplanted into a fresh encoder which
        encodes ``u[i:]``; the two parts must concatenate to *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test cases.

    Every method reads ``self.encoding``; concrete subclasses supply it
    together with the unittest assertion methods.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one encoded byte at a time and compare each step.

        After feeding byte number ``k`` the text decoded so far must equal
        ``partialresults[k]``.  The check is run with a StreamReader, with an
        incremental decoder, with the same decoder after reset(), and finally
        through codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must split on \\n, \\r\\n, \\r and \\u2028 for any size hint."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Collect every line until readline() returns an empty string and
            # join them with "|" so line boundaries show up in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        """Interleaving readline(), read() and readlines() must not lose data."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        """Iterating a StreamReader must return the \\r\\n-terminated lines intact."""
        # NOTE(review): runs of spaces inside these literals look collapsed
        # (single leading space, single spaces between words); confirm the
        # fixture against the upstream file before relying on exact lengths.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that grows between calls (writer and reader share a Queue)."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three \\r\\n-terminated lines must come back unchanged, then ""."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five short \\r\\n-terminated lines must come back unchanged, then ""."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" encoded with a little-endian / big-endian BOM respectively.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError on read."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated final code unit is replaced or dropped per error handler."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # Without a BOM the first character appears after its fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # Without a BOM the first character appears after its fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-prefixed "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" encoded with a little-endian / big-endian BOM respectively.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit a single BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError on read."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """A truncated final code unit is replaced or dropped per error handler."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' instead of the former 'U': universal-newlines mode is
        # deprecated and open() rejects it from Python 3.11 on, while 'r'
        # still exercises the "no newline translation" guarantee.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD with 'replace'."""
        # (raw bytes, expected text when decoded with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the BOM-less big-endian "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD with 'replace'."""
        # (raw bytes, expected text when decoded with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-8 codec."""

    encoding = "utf-8"

    def test_partial(self):
        # check_partial comes from ReadTest (defined outside this section).
        # The 15 expected results line up with the 15 bytes of the UTF-8
        # encoding of the text, so presumably one step is taken per byte.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_decoder_state(self):
        # check_state_handling_decode is provided by a mixin defined
        # earlier in the file.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # Strict mode refuses lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # Every other error handler yields a well-defined replacement.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        # 'surrogatepass' lets lone surrogates through in both directions.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted surrogate sequences are still errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000749
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows code page 65001 codec."""

    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); an expected value
        # of None means the encode call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); an expected value
        # of None means the decode call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode refuses lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        # The remaining handlers each produce a fixed replacement.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # 'surrogatepass' lets lone surrogates through in both directions.
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))

    def test_readline(self):
        # Overrides the test of the same name (presumably inherited from
        # ReadTest) so that it is reported as skipped.
        self.skipTest("issue #20571: code page 65001 codec does not "
                      "support partial decoder yet")
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200852
853
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec."""

    encoding = "utf-7"

    def test_partial(self):
        # check_partial comes from ReadTest (defined outside this section);
        # the list gives the expected result after each step of the
        # incremental decode.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_steps = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # Each malformed input must be rejected in strict mode (final=True)
        # and must decode to the paired text with the 'replace' handler.
        malformed = (
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        )
        for raw, expected in malformed:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_7_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # The code point and its explicit surrogate pair encode to the same
        # bytes; decoding yields the single non-BMP code point.
        raw = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), '\U000104A0')
924
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True the
        # strict decoder must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
932
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode."""

    def test_array(self):
        # An array of signed bytes is accepted and copied out as bytes.
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        # The empty string yields empty bytes and a consumed length of 0.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
948
class UTF8SigTest(ReadTest, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # check_partial comes from ReadTest (defined outside this section);
        # the list gives the expected result after each step of the
        # incremental decode.
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # Encoding adds a BOM; the incremental decoder must strip it again.
        s = "spam"
        d = codecs.getincrementaldecoder("utf-8-sig")()
        encoded = s.encode("utf-8-sig")
        self.assertEqual(d.decode(encoded), s)

    # Size hints used by the stream tests: None means "read() without an
    # argument", the integers are passed to read() as its size.
    _stream_sizehints = [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]

    def _read_utf8_sig_stream(self, bytestring, sizehint):
        # Wrap bytestring in a utf-8-sig StreamReader and collect everything
        # it returns until a read comes back empty.
        istream = codecs.getreader("utf-8-sig")(io.BytesIO(bytestring))
        ostream = io.StringIO()
        while True:
            if sizehint is None:
                data = istream.read()
            else:
                data = istream.read(sizehint)
            if not data:
                break
            ostream.write(data)
        return ostream.getvalue()

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        for sizehint in self._stream_sizehints:
            got = self._read_utf8_sig_stream(bytestring, sizehint)
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        for sizehint in self._stream_sizehints:
            got = self._read_utf8_sig_stream(bytestring, sizehint)
            self.assertEqual(got, unistring)
1032
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by b'0', is returned
        # unchanged with both bytes reported as consumed.
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            raw = byte + b'0'
            self.assertEqual(codecs.escape_decode(raw), (raw, 2))

    def test_escape(self):
        # coding_checker (module-level helper) returns a function that
        # asserts decode(input) == (expect, len(input)).
        check = coding_checker(self, codecs.escape_decode)
        cases = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for raw, expected in cases:
            check(raw, expected)
        # Bytes that do not start a recognised escape are passed through
        # together with their backslash.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            passthrough = b'\\' + bytes([code])
            check(passthrough, passthrough)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: an error by default, dropped by 'ignore',
        # turned into b'?' by 'replace'.
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001084
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile."""

    def test_recoding(self):
        f = io.BytesIO()
        # Use the recoder as a context manager so that it is closed even if
        # write() raises, instead of relying on an explicit close() call
        # that an exception would skip.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the recoder must also close the wrapped stream.
        self.assertTrue(f.closed)
Fred Drake2e2be372001-09-20 21:33:42 +00001093
# From RFC 3492
# Each entry pairs a Unicode string with its Punycode form (as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Sanity check on the table above: report any entry that is not a
# two-element (unicode, punycode) tuple, e.g. because of a missing comma.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001201
class PunycodeTest(unittest.TestCase):
    """Checks the punycode codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii")
            reference = str(puny, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again after a round trip of the bytes through ASCII
            # text.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001220
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        # Inputs are written big-endian and byte-reversed on little-endian
        # machines.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if little_endian:
                internal = internal[::-1]
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if little_endian:
                internal = internal[::-1]
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                with self.assertRaises(UnicodeDecodeError):
                    internal.decode("unicode_internal")
        # 0x110000 is one past the last valid code point.
        if little_endian:
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised exception must describe the codec, the full input and
        # the offending byte range.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a private name and check that
        # the decoder uses it to drop the invalid unit placed between the
        # encodings of "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001296
Martin v. Löwis2548c732003-04-18 10:39:54 +00001297# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1298nameprep_tests = [
1299 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001300 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1301 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1302 b'\xb8\x8f\xef\xbb\xbf',
1303 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001304 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001305 (b'CAFE',
1306 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 # 3.3 Case folding 8bit U+00DF (german sharp s).
1308 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'\xc3\x9f',
1310 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'\xc4\xb0',
1313 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'\xc5\x83\xcd\xba',
1316 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1318 # XXX: skip this as it fails in UCS-2 mode
1319 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1320 # 'telc\xe2\x88\x95kg\xcf\x83'),
1321 (None, None),
1322 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001323 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1324 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001326 (b'\xe1\xbe\xb7',
1327 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328 # 3.9 Self-reverting case folding U+01F0 and normalization.
1329 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001330 (b'\xc7\xb0',
1331 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001333 (b'\xce\x90',
1334 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001336 (b'\xce\xb0',
1337 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001338 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001339 (b'\xe1\xba\x96',
1340 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001342 (b'\xe1\xbd\x96',
1343 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001344 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001345 (b' ',
1346 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001347 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001348 (b'\xc2\xa0',
1349 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001350 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001351 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352 None),
1353 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001354 (b'\xe2\x80\x80',
1355 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001356 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001357 (b'\xe2\x80\x8b',
1358 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001359 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001360 (b'\xe3\x80\x80',
1361 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001362 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001363 (b'\x10\x7f',
1364 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001365 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001366 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001367 None),
1368 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001369 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 None),
1371 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001372 (b'\xef\xbb\xbf',
1373 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001374 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001375 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001376 None),
1377 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001378 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001379 None),
1380 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001381 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001382 None),
1383 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001384 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001385 None),
1386 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001387 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001388 None),
1389 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001390 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001391 None),
1392 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001393 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001394 None),
1395 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001396 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001397 None),
1398 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001399 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001400 None),
1401 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001402 (b'\xcd\x81',
1403 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001404 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001405 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001406 None),
1407 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001408 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001409 None),
1410 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001411 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001412 None),
1413 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001414 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001415 None),
1416 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001417 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001418 None),
1419 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001420 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001421 None),
1422 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001423 (b'foo\xef\xb9\xb6bar',
1424 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001425 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001426 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001427 None),
1428 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001429 (b'\xd8\xa71\xd8\xa8',
1430 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001431 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001432 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001433 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001434 # None),
1435 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001436 # 3.44 Larger test (shrinking).
1437 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001438 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1439 b'\xaa\xce\xb0\xe2\x80\x80',
1440 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001441 # 3.45 Larger test (expanding).
1442 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001443 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1444 b'\x80',
1445 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1446 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1447 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001448 ]
1449
1450
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the vectors in ``nameprep_tests``."""

    def test_nameprep(self):
        # Each vector is an (orig, prepped) pair of UTF-8 byte strings.
        # orig is None      -> the vector is skipped.
        # prepped is None   -> nameprep must raise UnicodeError.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Re-raise with the vector number (pos is 0-based, the
                    # vectors are numbered from 3.1) and keep the original
                    # failure attached as the explicit cause.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos+1, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001469
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        # Punycode label followed by a final label without a trailing dot
        # (this assertion used to be an exact duplicate of the previous one).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )

        # The decoder holds back the last, possibly unfinished, label until
        # it sees a dot or is told that the input is final.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() feeds the string one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )
        # Non-ASCII label followed by a final label without a trailing dot
        # (this assertion used to be an exact duplicate of the previous one).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )

        # The encoder holds back the last, possibly unfinished, label until
        # it sees a dot or is told that the input is final.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001545
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so that the original locale
        # comes back even if the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1600
class StreamReaderTest(unittest.TestCase):
    """Tests for the StreamReader returned by codecs.getreader()."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        # readlines() keeps the line ending on the first line; the last line
        # has none.
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001610
class EncodedFileTest(unittest.TestCase):
    """Tests for the transcoding wrapper codecs.EncodedFile."""

    def test_basic(self):
        # Reading: the file holds UTF-8, read() hands back UTF-16-LE bytes.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes passed to write() reach the file as Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001622
# Names of the text encodings that are iterated over by the tests below.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1727
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Same list extended with "idna"; built as a new list, so the list above is
# left unchanged.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001740
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Round-trip checks run against every name in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, up to
            # "_" versus "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "encoding=%r" % encoding)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1902
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001903class CharmapTest(unittest.TestCase):
1904 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001905 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001906 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001907 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001908 )
1909
Ezio Melottib3aedd42010-11-20 19:04:17 +00001910 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001911 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1912 ("\U0010FFFFbc", 3)
1913 )
1914
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001915 self.assertRaises(UnicodeDecodeError,
1916 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1917 )
1918
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001919 self.assertRaises(UnicodeDecodeError,
1920 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1921 )
1922
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001923 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001924 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001925 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001926 )
1927
Ezio Melottib3aedd42010-11-20 19:04:17 +00001928 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001929 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001930 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001931 )
1932
Ezio Melottib3aedd42010-11-20 19:04:17 +00001933 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001934 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001935 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001936 )
1937
Ezio Melottib3aedd42010-11-20 19:04:17 +00001938 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001939 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001940 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001941 )
1942
Guido van Rossum805365e2007-05-07 22:24:25 +00001943 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001944 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001945 codecs.charmap_decode(allbytes, "ignore", ""),
1946 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001947 )
1948
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001949 def test_decode_with_int2str_map(self):
1950 self.assertEqual(
1951 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1952 {0: 'a', 1: 'b', 2: 'c'}),
1953 ("abc", 3)
1954 )
1955
1956 self.assertEqual(
1957 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1958 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1959 ("AaBbCc", 3)
1960 )
1961
1962 self.assertEqual(
1963 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1964 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1965 ("\U0010FFFFbc", 3)
1966 )
1967
1968 self.assertEqual(
1969 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1970 {0: 'a', 1: 'b', 2: ''}),
1971 ("ab", 3)
1972 )
1973
1974 self.assertRaises(UnicodeDecodeError,
1975 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1976 {0: 'a', 1: 'b'}
1977 )
1978
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001979 self.assertRaises(UnicodeDecodeError,
1980 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1981 {0: 'a', 1: 'b', 2: None}
1982 )
1983
1984 # Issue #14850
1985 self.assertRaises(UnicodeDecodeError,
1986 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1987 {0: 'a', 1: 'b', 2: '\ufffe'}
1988 )
1989
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001990 self.assertEqual(
1991 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1992 {0: 'a', 1: 'b'}),
1993 ("ab\ufffd", 3)
1994 )
1995
1996 self.assertEqual(
1997 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1998 {0: 'a', 1: 'b', 2: None}),
1999 ("ab\ufffd", 3)
2000 )
2001
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002002 # Issue #14850
2003 self.assertEqual(
2004 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2005 {0: 'a', 1: 'b', 2: '\ufffe'}),
2006 ("ab\ufffd", 3)
2007 )
2008
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002009 self.assertEqual(
2010 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2011 {0: 'a', 1: 'b'}),
2012 ("ab", 3)
2013 )
2014
2015 self.assertEqual(
2016 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2017 {0: 'a', 1: 'b', 2: None}),
2018 ("ab", 3)
2019 )
2020
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002021 # Issue #14850
2022 self.assertEqual(
2023 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2024 {0: 'a', 1: 'b', 2: '\ufffe'}),
2025 ("ab", 3)
2026 )
2027
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002028 allbytes = bytes(range(256))
2029 self.assertEqual(
2030 codecs.charmap_decode(allbytes, "ignore", {}),
2031 ("", len(allbytes))
2032 )
2033
2034 def test_decode_with_int2int_map(self):
2035 a = ord('a')
2036 b = ord('b')
2037 c = ord('c')
2038
2039 self.assertEqual(
2040 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2041 {0: a, 1: b, 2: c}),
2042 ("abc", 3)
2043 )
2044
2045 # Issue #15379
2046 self.assertEqual(
2047 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2048 {0: 0x10FFFF, 1: b, 2: c}),
2049 ("\U0010FFFFbc", 3)
2050 )
2051
Antoine Pitroua1f76552012-09-23 20:00:04 +02002052 self.assertEqual(
2053 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2054 {0: sys.maxunicode, 1: b, 2: c}),
2055 (chr(sys.maxunicode) + "bc", 3)
2056 )
2057
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002058 self.assertRaises(TypeError,
2059 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002060 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002061 )
2062
2063 self.assertRaises(UnicodeDecodeError,
2064 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2065 {0: a, 1: b},
2066 )
2067
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002068 self.assertRaises(UnicodeDecodeError,
2069 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2070 {0: a, 1: b, 2: 0xFFFE},
2071 )
2072
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002073 self.assertEqual(
2074 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2075 {0: a, 1: b}),
2076 ("ab\ufffd", 3)
2077 )
2078
2079 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002080 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2081 {0: a, 1: b, 2: 0xFFFE}),
2082 ("ab\ufffd", 3)
2083 )
2084
2085 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002086 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2087 {0: a, 1: b}),
2088 ("ab", 3)
2089 )
2090
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002091 self.assertEqual(
2092 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2093 {0: a, 1: b, 2: 0xFFFE}),
2094 ("ab", 3)
2095 )
2096
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002097
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers can be used in ``with``."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8 bytes; EncodedFile is asked for
        # "latin-1" data on top of a "utf-8" file.
        stream = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(stream, "latin-1", "utf-8") as recoder:
            recoded = recoder.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        # Build the reader/writer pair from the registered UTF-8 codec.
        stream = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(stream, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002110
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept str input: each one listed here must
        # raise TypeError when handed a text string.
        bytes_only_decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only included when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            bytes_only_decoders.append(mbcs_decode)
        for decode in bytes_only_decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)

        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # A \U escape above U+10FFFF is an error in strict mode and becomes
        # U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2146
Serhiy Storchakad6793772013-01-29 10:20:44 +02002147
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        # Empty input round-trips to empty output with zero consumed.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is emitted unchanged.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for b in range(32, 127):
            if b == backslash:
                continue
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte except the backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for b in range(256):
            if b == backslash:
                continue
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)

        # Characters with a dedicated short escape.
        for char, escaped in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(char, escaped)

        # Remaining C0 controls use the \xNN form ...
        for b in range(32):
            if chr(b) in '\t\n\r':
                continue
            check(chr(b), ('\\x%02x' % b).encode())
        # ... and so do DEL and the Latin-1 upper half.
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())

        # BMP and non-BMP characters use \u and \U respectively.
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)

        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for escaped, text in recognised:
            check(escaped, text)

        # A backslash before any other byte is kept verbatim.
        for b in range(256):
            if b in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape with only i hex digits (fewer than required).
                truncated = b"\\" + c + b"0"*i
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))

        # A complete \U escape whose value is above U+10FFFF.
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2224
2225
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / _decode()."""

    def test_empty(self):
        # Empty input round-trips to empty output with zero consumed.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every Latin-1 code point is emitted as the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            expected = (bytes([b]), 1)
            self.assertEqual(encode(chr(b)), expected)

    def test_raw_decode(self):
        # Every byte value decodes to the code point of the same value.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            expected = (chr(b) + '0', 2)
            self.assertEqual(decode(bytes([b]) + b'0'), expected)

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but "u"/"U" passes through as is.
        for b in range(256):
            if b in b'uU':
                continue
            check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash followed by anything but "u"/"U" passes through as is.
        for b in range(256):
            if b in b'uU':
                continue
            check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape with only i hex digits (fewer than required).
                truncated = b"\\" + c + b"0"*i
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))

        # A complete \U escape whose value is above U+10FFFF.
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2274
2275
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        samples = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in samples:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2308
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002309
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files across seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        bom_encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        # Every scenario below reuses the same scratch file.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in bom_encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002365
Victor Stinner3fed0872010-05-22 02:16:27 +00002366
# Names of the bytes-to-bytes transform codecs exercised by the tests that
# use this list.  The compression codecs are added only when the module
# they are named after can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2385
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes (and rot_13) transform codecs."""

    def test_basics(self):
        # Round-trip every byte value through the generic codecs interface.
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        # A stream reader over encoded data returns the original byte.
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # Same as test_read, but going through readline().
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            pattern = template.format(encoding)
            with self.assertRaisesRegex(LookupError, pattern) as failure:
                bad_input.encode(encoding)
            # The error must not be chained to another exception.
            self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, pattern):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        template = (r"{!r} is not a text encoding; "
                    r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            encoded_data = codecs.encode(data, encoding)
            pattern = template.format(encoding)
            with self.assertRaisesRegex(LookupError, pattern):
                encoded_data.decode(encoding)
            with self.assertRaisesRegex(LookupError, pattern):
                bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        pattern = (r"^'rot_13' is not a text encoding; "
                   r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.assertRaisesRegex(LookupError, pattern) as failure:
                bad_input.decode("rot_13")
            # The error must not be chained to another exception.
            self.assertIsNone(failure.exception.__cause__)
2452
Georg Brandl02524622010-12-02 18:06:51 +00002453
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode().

    The whole class is skipped unless sys.platform is 'win32'.
    """

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page number is rejected with ValueError.
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        # Code page 123 is expected to fail with an OS-level error.
        # OSError is used instead of its deprecated Windows-only alias
        # WindowsError; on Windows both names refer to the same class.
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each sample with code page *cp* and check the outcome.

        *tests* is an iterable of ``(raw, errors, expected)`` triples.
        When *expected* is None the call must raise UnicodeDecodeError;
        otherwise the decoded text must equal *expected* and the reported
        number of consumed bytes must lie between 0 and ``len(raw)``.
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors)
                except UnicodeDecodeError as err:
                    # Report an unexpected error as a test failure that
                    # names the input, code page and error handler.
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors)

    def check_encode(self, cp, tests):
        """Encode each sample with code page *cp* and check the outcome.

        *tests* is an iterable of ``(text, errors, expected)`` triples.
        When *expected* is None the call must raise UnicodeEncodeError;
        otherwise the encoded bytes must equal *expected* and the reported
        number of consumed characters must equal ``len(text)``.
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    # Report an unexpected error as a test failure that
                    # names the input, code page and error handler.
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        # Each input starts with one invalid byte followed by a valid
        # multibyte sequence.
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The CP_UTF8 encode checks only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # The fourth positional argument is False in every call below
        # (presumably the "final" flag -- confirm against the codecs docs).
        # An incomplete trailing sequence is then left unconsumed, as the
        # returned lengths show.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
2601
2602
# Run every test case defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()