blob: 3950c3bc2080086d4422877eac0a8311cd3e39df [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10003import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01004import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02005import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01006import sys
7import unittest
8import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10009import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +010010
11from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020012
# True only when running on Windows with a major version of 6 or higher;
# always False on every other platform (getwindowsversion() is only
# consulted when sys.platform is 'win32').
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)
17
# ctypes is optional: when it cannot be imported the name is bound to None
# and SIZEOF_WCHAR_T is set to the sentinel -1.
try:
    import ctypes
except ImportError:
    ctypes = None

# Size in bytes of the C wchar_t type, or -1 when ctypes is unavailable.
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000025
def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and to *coder*.

    The returned ``check(input, expect)`` asserts through
    ``self.assertEqual`` that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. that the coder produced *expect* and
    reported the whole input as consumed.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)

    return check
30
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end.

    The buffer has whatever sequence type the initial value passed to the
    constructor has (the tests in this file use bytes).
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return buffered data from the front.

        A negative *size* (the default) drains the whole buffer; otherwise
        at most *size* items are returned.
        """
        pending = self._buffer
        if size < 0:
            # Slicing (instead of a literal) keeps the emptied buffer the
            # same type as the original one.
            taken, remaining = pending, pending[:0]
        else:
            taken, remaining = pending[:size], pending[size:]
        self._buffer = remaining
        return taken
50
class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for codecs.

    The class it is mixed into must provide the assertion methods used
    below (assertEqual, assertIsInstance, assertTrue).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder state transfer at every split point of *s*.

        For each prefix of the encoded input *s*, the state of a decoder
        that was fed the prefix is copied into a fresh decoder, which then
        decodes the remainder; the two outputs together must equal *u*.
        """
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            flags = state[1]
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                buffered = state[0]
                # Put the decoder into the default state with an empty
                # buffer of the same type as the buffered input.
                decoder.setstate((buffered[:0], 0))
                # Feeding the buffered input again must not produce output
                self.assertTrue(not decoder.decode(buffered))
                # ... and the decoder must be back in the saved state
                self.assertEqual(state, decoder.getstate())
            # Transplant the saved state into a brand-new decoder and let
            # it finish the job.
            successor = codecs.getincrementaldecoder(encoding)()
            successor.setstate(state)
            tail = successor.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder state transfer at every split point of *u*.

        Mirror image of check_state_handling_decode(): the text *u* is
        encoded in two parts by two encoders sharing the saved state, and
        the concatenated output must equal *s*.
        """
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            successor = codecs.getincrementalencoder(encoding)()
            successor.setstate(state)
            tail = successor.encode(u[split:], True)
            self.assertEqual(s, head + tail)
83
class ReadTest(MixInCheckStateHandling):
    """Shared reader/decoder tests, run once per codec by the subclasses.

    Concrete subclasses also derive from unittest.TestCase and must define
    ``encoding``.  test_lone_surrogates() additionally reads
    ``ill_formed_sequence`` (the bytes that decode to the lone surrogate
    U+DC80 with "surrogatepass") and ``ill_formed_sequence_replace`` (what
    those bytes decode to with the "replace" handler).
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and check each step.

        After byte number i the text decoded so far must equal
        ``partialresults[i]``.  The check is done with a StreamReader, with
        an incremental decoder, with the same decoder after reset(), and
        finally the whole input is round-tripped through iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        encoded = input.encode(self.encoding)
        chunks = [bytes([c]) for c in encoded]
        result = ""
        for (chunk, partialresult) in zip(chunks, partialresults):
            q.write(chunk)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same byte-by-byte check for an incremental decoder.
            result = ""
            for (chunk, partialresult) in zip(chunks, partialresults):
                result += decoder.decode(chunk)
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        feed_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(chunks, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and return the
            # lines joined with "|" so that line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The terminators are listed explicitly: all of them are whitespace,
        # so "\n \r\n \r \u2028".split() returns [] and loops over that
        # result never execute.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line is non-empty (+200): readalllines() stops at the
            # first empty result, which an empty line gives with
            # keepends=False.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so must the expectation.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that separates the repetitions
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back these \r\n-terminated
        # lines unchanged.
        # NOTE(review): the indentation inside the fixture lines looks
        # collapsed to a single space; the strings are kept exactly as
        # found, which is harmless because the test only round-trips them.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written in pieces
        # becomes visible to readline() incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What ill_formed_sequence decodes to with the "replace" handler;
    # subclasses override it when their codec yields something else.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate fails in strict mode and follows the
        # requested error handler otherwise.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string (e.g. a BOM) is cut
        # off the encoded neighbours so that it appears only once, in front.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
371
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest suite plus BOM-related checks for the "utf-32" codec."""

    encoding = "utf-32"
    # Bytes for the lone surrogate U+DC80 as one 32-bit unit in the byte
    # order of the running platform.
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" preceded by exactly one BOM: little-endian and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = info.streamwriter(sink)
        out.write("spam")
        out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(data == self.spamle or data == self.spambe)
        # try to read it back
        source = info.streamreader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of 0xff start with a valid BOM.
        for garbage in (4*b"\xff", 8*b"\xff"):
            stream = io.BytesIO(garbage)
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected prefix per input byte: the first four bytes are the
        # BOM, after that every character needs four bytes.
        expected = (
            7 * [""] +
            4 * ["\x00"] +
            4 * ["\x00\xff"] +
            4 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_handlers(self):
        # A lone byte decoded with final=True goes through the handler.
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        for encoded in (encoded_le, encoded_be):
            decoded = codecs.utf_32_decode(encoded)[0]
            self.assertEqual('\U00010000' * 1024, decoded)
466
class UTF32LETest(ReadTest, unittest.TestCase):
    """ReadTest suite plus extra checks for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Bytes for the lone surrogate U+DC80 as a little-endian 32-bit unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected prefix per input byte: a character shows up once
        # all four of its bytes have been fed.
        expected = (
            3 * [""] +
            4 * ["\x00"] +
            4 * ["\x00\xff"] +
            4 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # Decoding a lone byte with final=True must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
511
class UTF32BETest(ReadTest, unittest.TestCase):
    """ReadTest suite plus extra checks for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Bytes for the lone surrogate U+DC80 as a big-endian 32-bit unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected prefix per input byte: a character shows up once
        # all four of its bytes have been fed.
        expected = (
            3 * [""] +
            4 * ["\x00"] +
            4 * ["\x00\xff"] +
            4 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # Decoding a lone byte with final=True must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
556
557
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest suite plus BOM-related checks for the "utf-16" codec."""

    encoding = "utf-16"
    # Bytes for the lone surrogate U+DC80 as one 16-bit unit in the byte
    # order of the running platform.
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" preceded by exactly one BOM: little-endian and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = info.streamwriter(sink)
        out.write("spam")
        out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(data == self.spamle or data == self.spambe)
        # try to read it back
        source = info.streamreader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        # Neither 2 nor 4 bytes of 0xff start with a valid BOM.
        for garbage in (b"\xff\xff", b"\xff\xff\xff\xff"):
            stream = io.BytesIO(garbage)
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected prefix per input byte: two BOM bytes, then two bytes
        # per BMP character and four for the final non-BMP character.
        expected = (
            3 * [""] +
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            2 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_handlers(self):
        # A lone byte decoded with final=True goes through the handler.
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000643
class UTF16LETest(ReadTest, unittest.TestCase):
    """ReadTest suite plus extra checks for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Bytes for the lone surrogate U+DC80 as a little-endian 16-bit unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected prefix per input byte: two bytes per BMP character
        # and four for the final non-BMP character.
        expected = (
            1 * [""] +
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            2 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, what it decodes to with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b'\x00\xd8\x03\xde')
        decoded = b'\x00\xd8\x03\xde'.decode(self.encoding)
        self.assertEqual(decoded, "\U00010203")
687
class UTF16BETest(ReadTest, unittest.TestCase):
    """ReadTest suite plus extra checks for the "utf-16-be" codec."""

    encoding = "utf-16-be"
    # Bytes for the lone surrogate U+DC80 as a big-endian 16-bit unit.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected prefix per input byte: two bytes per BMP character
        # and four for the final non-BMP character.
        expected = (
            1 * [""] +
            2 * ["\x00"] +
            2 * ["\x00\xff"] +
            2 * ["\x00\xff\u0100"] +
            4 * ["\x00\xff\u0100\uffff"] +
            1 * ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, what it decodes to with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b'\xd8\x00\xde\x03')
        decoded = b'\xd8\x00\xde\x03'.decode(self.encoding)
        self.assertEqual(decoded, "\U00010203")
731
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-8 codec."""

    encoding = "utf-8"
    # UTF-8-style encoding of the lone surrogate U+DC80 and the text it is
    # expected to become under 'replace'; presumably used by ReadTest.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        """Feed the encoded text incrementally and compare partial results."""
        # 15 expected results, one per byte of the UTF-8 encoded input.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        """Run the shared decoder state check over a range of code points."""
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        """Extend the inherited check with the surrogateescape handler."""
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        """Surrogates round-trip with 'surrogatepass'; truncated or broken
        surrogate sequences still raise on decoding."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000785
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only cp65001 codec."""

    encoding = "cp65001"

    def test_encode(self):
        """Encode sample texts; an expected value of None means the call
        must raise UnicodeEncodeError."""
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The lone-surrogate expectations depend on VISTA_OR_LATER.
        if VISTA_OR_LATER:
            cases.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode sample byte strings; an expected value of None means the
        call must raise UnicodeDecodeError."""
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        # The encoded-surrogate expectations depend on VISTA_OR_LATER.
        if VISTA_OR_LATER:
            cases.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            cases.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and handled by each
        of the listed error handlers."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """Surrogates round-trip through cp65001 with 'surrogatepass'."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"),
                             encoded)
            self.assertEqual(encoded.decode("cp65001", "surrogatepass"),
                             text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
884
885
886
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec."""

    encoding = "utf-7"

    def test_partial(self):
        """Feed the encoded text incrementally and compare partial results."""
        expected_steps = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial("a+-b", expected_steps)

    def test_errors(self):
        """Malformed input must fail in strict mode and decode with U+FFFD
        under the 'replace' error handler."""
        cases = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character and its surrogate pair encode identically,
        and the encoded form decodes back to the non-BMP character."""
        codec = self.encoding
        self.assertEqual('\U000104A0'.encode(codec), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(codec), '\U000104A0')

    # Replace the inherited test_lone_surrogates with None for this codec.
    test_lone_surrogates = None
933
934
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode function."""

    def test_errors(self):
        """A single byte with final=True is rejected in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder without arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
942
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode."""

    def test_array(self):
        """An array of bytes is returned as bytes with its length."""
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
958
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with a leading BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        """Feed the encoded text incrementally and compare partial results."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder drops the BOM written by the encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decodes(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader with a range
        of read sizes and check that the joined output equals *unistring*.

        A size hint of None means read() is called without an argument.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest names the read size that failed and lets the
            # remaining sizes run.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """A stream that starts with a BOM decodes to the text without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decodes(bytestring, unistring)

    def test_stream_bare(self):
        """A stream without a BOM decodes to the same text."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decodes(bytestring, unistring)
1042
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode."""

    def test_empty(self):
        """Empty input decodes to empty bytes with zero length consumed."""
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        """Every byte except the backslash passes through unchanged."""
        decode = codecs.escape_decode
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        """Check the recognised escapes, then verify that a backslash
        followed by any other byte is left untouched."""
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        for value in range(256):
            # Skip the bytes that start an escape checked above.
            if value in b'\n"\'\\abtnvfr01234567x':
                continue
            sequence = b'\\' + bytes([value])
            check(sequence, sequence)

    def test_errors(self):
        """Incomplete \\x escapes raise ValueError by default and follow
        the 'ignore' and 'replace' error handlers when given."""
        decode = codecs.escape_decode
        for incomplete in (br"\x", br"[\x]"):
            self.assertRaises(ValueError, decode, incomplete)
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        for incomplete in (br"\x0", br"[\x0]"):
            self.assertRaises(ValueError, decode, incomplete)
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001094
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        """Write one character through a recoding wrapper and close it."""
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        raw_stream = io.BytesIO()
        recoder = codecs.EncodedFile(raw_stream, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001103
# Punycode sample strings, from RFC 3492.
# Each entry is a pair: (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #     T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #     <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: print any entry that is not a 2-tuple
# (for example, when a comma is missing between the two elements).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001211
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over the punycode_testcases samples."""

    def test_encode(self):
        """Encoding each text matches the expected bytes, ignoring case."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii")
            wanted = str(puny, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding the expected bytes gives back the text, both directly
        and after an ASCII decode/encode round trip of the bytes."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            recoded = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, recoded.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001230
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on
        # little-endian machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is one past the largest valid code point.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError raised for bad input carries the codec
        name, the original bytes and the offending byte range."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports a descriptive failure when nothing is raised,
        # unlike a bare self.fail() in a try/except/else.
        with self.assertRaises(UnicodeDecodeError) as cm:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is applied to undecodable input."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four 0x22 bytes between the encoded "a" and "b".
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items consumed."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001306
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings and
# corresponds to section 3.<index+1> of the draft:
#   * expected is None   -> the input must be rejected by nameprep
#   * (None, None)       -> the vector is skipped; the placeholder keeps the
#                           list index aligned with the draft's numbering
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FE76 and LCat characters.
    # (The heading previously said U+FB38, but the bytes \xef\xb9\xb6
    # below are the UTF-8 encoding of U+FE76.)
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    # NOTE(review): \xc3\x9f is what the data below already uses; this note
    # presumably meant \xc3\xdf as in 3.3 and 3.44 -- confirm against the draft.
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1459
1460
class NameprepTest(unittest.TestCase):
    """Run the nameprep test vectors against encodings.idna.nameprep()."""

    def test_nameprep(self):
        # Every vector is an (input, expected) pair of UTF-8 byte strings.
        # A missing input marks a skipped vector; a missing expectation
        # means nameprep() has to reject the input.
        from encodings.idna import nameprep
        for index, vector in enumerate(nameprep_tests):
            raw_input, raw_expected = vector
            if raw_input is None:
                # Placeholder for a vector that is deliberately not run
                continue
            # surrogatepass lets the lone-surrogate vector decode
            text = raw_input.decode("utf-8", "surrogatepass")
            if raw_expected is None:
                # Prohibited input: nameprep() must raise
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = raw_expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the section number of the failing vector
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001479
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use.

    Host names are checked in four flavours throughout: plain ASCII and
    internationalized, each with and without a trailing dot.
    """

    def test_builtin_decode(self):
        # One-shot decoding through the str() constructor.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # One-shot encoding through str.encode().
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the stream reader has consumed everything, a further read()
        # must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().  The third
        # case (IDN without trailing dot) used to be an accidental duplicate
        # of the fourth, leaving that flavour untested.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        # A label is only emitted once its terminating dot has been seen;
        # the unfinished last label is flushed by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # After reset() the decoder must behave like a fresh one; with a
        # trailing dot nothing is left over for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() iterates the str character by character.  The third
        # case (IDN without trailing dot) used to be an accidental duplicate
        # of the fourth, leaving that flavour untested.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Mirror of the decoder checks: output appears per completed label,
        # the last unfinished label is flushed by the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001555
class CodecsModuleTest(unittest.TestCase):
    """Checks of the module-level helper functions exposed by codecs."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        # The object to decode is mandatory
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument is optional
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        # The object to encode is mandatory
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument is optional
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        # register() needs exactly one callable argument
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it always runs
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1610
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two lines of UTF-8 text, one three-byte character each
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # The line terminator stays attached to the first line
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001620
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 file content comes back as UTF-16-LE
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input ends up in the file as Latin-1
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001632
# Names of the text encodings that the BasicUnicodeTest loops iterate over.
# The names use underscores; test_basics compares them (with "_" and "-"
# treated alike) against the name reported by codecs.lookup().
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (they are excluded from the stream reader/writer and seek checks).
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings excluded from the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001751
# Round-trip and argument-checking tests that are run against every name in
# all_unicode_encodings.
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        # For each encoding: check the registered name, then round-trip a
        # small ASCII string through the stateless, stream, incremental
        # (Python and C API) and iterencode()/iterdecode() interfaces.
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry,
            # treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # stateless encoder/decoder round trip
            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect what reaches
                # the queue after each write.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one at a time and read
                # whatever the reader can decode so far.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes anything the encoder buffered
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                # NOTE(review): with this block nested under the
                # broken_incremental_coders check, the "idna" exclusion
                # below is redundant -- confirm the intended nesting.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # Reading the whole stream after seek(0, 0) must give the full text
        # again, every time.
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all except
        # idna and punycode must also reject an int as input.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (The assertion compares the type with itself, so it cannot fail.)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        # (the checks themselves live in MixInCheckStateHandling).
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1889
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001890class CharmapTest(unittest.TestCase):
1891 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001892 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001893 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001894 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001895 )
1896
Ezio Melottib3aedd42010-11-20 19:04:17 +00001897 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001898 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1899 ("\U0010FFFFbc", 3)
1900 )
1901
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001902 self.assertRaises(UnicodeDecodeError,
1903 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1904 )
1905
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001906 self.assertRaises(UnicodeDecodeError,
1907 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1908 )
1909
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001910 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001911 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001912 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001913 )
1914
Ezio Melottib3aedd42010-11-20 19:04:17 +00001915 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001916 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001917 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001918 )
1919
Ezio Melottib3aedd42010-11-20 19:04:17 +00001920 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001921 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001922 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001923 )
1924
Ezio Melottib3aedd42010-11-20 19:04:17 +00001925 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001926 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001927 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001928 )
1929
Guido van Rossum805365e2007-05-07 22:24:25 +00001930 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001931 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001932 codecs.charmap_decode(allbytes, "ignore", ""),
1933 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001934 )
1935
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001936 def test_decode_with_int2str_map(self):
1937 self.assertEqual(
1938 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1939 {0: 'a', 1: 'b', 2: 'c'}),
1940 ("abc", 3)
1941 )
1942
1943 self.assertEqual(
1944 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1945 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1946 ("AaBbCc", 3)
1947 )
1948
1949 self.assertEqual(
1950 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1951 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1952 ("\U0010FFFFbc", 3)
1953 )
1954
1955 self.assertEqual(
1956 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1957 {0: 'a', 1: 'b', 2: ''}),
1958 ("ab", 3)
1959 )
1960
1961 self.assertRaises(UnicodeDecodeError,
1962 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1963 {0: 'a', 1: 'b'}
1964 )
1965
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001966 self.assertRaises(UnicodeDecodeError,
1967 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1968 {0: 'a', 1: 'b', 2: None}
1969 )
1970
1971 # Issue #14850
1972 self.assertRaises(UnicodeDecodeError,
1973 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1974 {0: 'a', 1: 'b', 2: '\ufffe'}
1975 )
1976
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001977 self.assertEqual(
1978 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1979 {0: 'a', 1: 'b'}),
1980 ("ab\ufffd", 3)
1981 )
1982
1983 self.assertEqual(
1984 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1985 {0: 'a', 1: 'b', 2: None}),
1986 ("ab\ufffd", 3)
1987 )
1988
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001989 # Issue #14850
1990 self.assertEqual(
1991 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1992 {0: 'a', 1: 'b', 2: '\ufffe'}),
1993 ("ab\ufffd", 3)
1994 )
1995
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001996 self.assertEqual(
1997 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1998 {0: 'a', 1: 'b'}),
1999 ("ab", 3)
2000 )
2001
2002 self.assertEqual(
2003 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2004 {0: 'a', 1: 'b', 2: None}),
2005 ("ab", 3)
2006 )
2007
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002008 # Issue #14850
2009 self.assertEqual(
2010 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2011 {0: 'a', 1: 'b', 2: '\ufffe'}),
2012 ("ab", 3)
2013 )
2014
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002015 allbytes = bytes(range(256))
2016 self.assertEqual(
2017 codecs.charmap_decode(allbytes, "ignore", {}),
2018 ("", len(allbytes))
2019 )
2020
2021 def test_decode_with_int2int_map(self):
2022 a = ord('a')
2023 b = ord('b')
2024 c = ord('c')
2025
2026 self.assertEqual(
2027 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2028 {0: a, 1: b, 2: c}),
2029 ("abc", 3)
2030 )
2031
2032 # Issue #15379
2033 self.assertEqual(
2034 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2035 {0: 0x10FFFF, 1: b, 2: c}),
2036 ("\U0010FFFFbc", 3)
2037 )
2038
Antoine Pitroua1f76552012-09-23 20:00:04 +02002039 self.assertEqual(
2040 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2041 {0: sys.maxunicode, 1: b, 2: c}),
2042 (chr(sys.maxunicode) + "bc", 3)
2043 )
2044
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002045 self.assertRaises(TypeError,
2046 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002047 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002048 )
2049
2050 self.assertRaises(UnicodeDecodeError,
2051 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2052 {0: a, 1: b},
2053 )
2054
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002055 self.assertRaises(UnicodeDecodeError,
2056 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2057 {0: a, 1: b, 2: 0xFFFE},
2058 )
2059
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002060 self.assertEqual(
2061 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2062 {0: a, 1: b}),
2063 ("ab\ufffd", 3)
2064 )
2065
2066 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002067 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2068 {0: a, 1: b, 2: 0xFFFE}),
2069 ("ab\ufffd", 3)
2070 )
2071
2072 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002073 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2074 {0: a, 1: b}),
2075 ("ab", 3)
2076 )
2077
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002078 self.assertEqual(
2079 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2080 {0: a, 1: b, 2: 0xFFFE}),
2081 ("ab", 3)
2082 )
2083
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002084
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers can be used in a ``with`` statement."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            content = ef.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        stream = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with stream as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002097
class TypesTest(unittest.TestCase):
    """Input types accepted by the low-level codec functions."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        )
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        # mbcs_decode is only checked when the codecs module provides it
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # \U00110000 fails in strict mode and becomes U+FFFD with "replace"
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2133
Serhiy Storchakad6793772013-01-29 10:20:44 +02002134
2135class UnicodeEscapeTest(unittest.TestCase):
2136 def test_empty(self):
2137 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2138 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2139
2140 def test_raw_encode(self):
2141 encode = codecs.unicode_escape_encode
2142 for b in range(32, 127):
2143 if b != b'\\'[0]:
2144 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2145
2146 def test_raw_decode(self):
2147 decode = codecs.unicode_escape_decode
2148 for b in range(256):
2149 if b != b'\\'[0]:
2150 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2151
2152 def test_escape_encode(self):
2153 encode = codecs.unicode_escape_encode
2154 check = coding_checker(self, encode)
2155 check('\t', br'\t')
2156 check('\n', br'\n')
2157 check('\r', br'\r')
2158 check('\\', br'\\')
2159 for b in range(32):
2160 if chr(b) not in '\t\n\r':
2161 check(chr(b), ('\\x%02x' % b).encode())
2162 for b in range(127, 256):
2163 check(chr(b), ('\\x%02x' % b).encode())
2164 check('\u20ac', br'\u20ac')
2165 check('\U0001d120', br'\U0001d120')
2166
2167 def test_escape_decode(self):
2168 decode = codecs.unicode_escape_decode
2169 check = coding_checker(self, decode)
2170 check(b"[\\\n]", "[]")
2171 check(br'[\"]', '["]')
2172 check(br"[\']", "[']")
2173 check(br"[\\]", r"[\]")
2174 check(br"[\a]", "[\x07]")
2175 check(br"[\b]", "[\x08]")
2176 check(br"[\t]", "[\x09]")
2177 check(br"[\n]", "[\x0a]")
2178 check(br"[\v]", "[\x0b]")
2179 check(br"[\f]", "[\x0c]")
2180 check(br"[\r]", "[\x0d]")
2181 check(br"[\7]", "[\x07]")
2182 check(br"[\8]", r"[\8]")
2183 check(br"[\78]", "[\x078]")
2184 check(br"[\41]", "[!]")
2185 check(br"[\418]", "[!8]")
2186 check(br"[\101]", "[A]")
2187 check(br"[\1010]", "[A0]")
2188 check(br"[\x41]", "[A]")
2189 check(br"[\x410]", "[A0]")
2190 check(br"\u20ac", "\u20ac")
2191 check(br"\U0001d120", "\U0001d120")
2192 for b in range(256):
2193 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2194 check(b'\\' + bytes([b]), '\\' + chr(b))
2195
2196 def test_decode_errors(self):
2197 decode = codecs.unicode_escape_decode
2198 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2199 for i in range(d):
2200 self.assertRaises(UnicodeDecodeError, decode,
2201 b"\\" + c + b"0"*i)
2202 self.assertRaises(UnicodeDecodeError, decode,
2203 b"[\\" + c + b"0"*i + b"]")
2204 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2205 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2206 self.assertEqual(decode(data, "replace"),
2207 ("[\ufffd]\ufffd", len(data)))
2208 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2209 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2210 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2211
2212
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002213class RawUnicodeEscapeTest(unittest.TestCase):
2214 def test_empty(self):
2215 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2216 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2217
2218 def test_raw_encode(self):
2219 encode = codecs.raw_unicode_escape_encode
2220 for b in range(256):
2221 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2222
2223 def test_raw_decode(self):
2224 decode = codecs.raw_unicode_escape_decode
2225 for b in range(256):
2226 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2227
2228 def test_escape_encode(self):
2229 encode = codecs.raw_unicode_escape_encode
2230 check = coding_checker(self, encode)
2231 for b in range(256):
2232 if b not in b'uU':
2233 check('\\' + chr(b), b'\\' + bytes([b]))
2234 check('\u20ac', br'\u20ac')
2235 check('\U0001d120', br'\U0001d120')
2236
2237 def test_escape_decode(self):
2238 decode = codecs.raw_unicode_escape_decode
2239 check = coding_checker(self, decode)
2240 for b in range(256):
2241 if b not in b'uU':
2242 check(b'\\' + bytes([b]), '\\' + chr(b))
2243 check(br"\u20ac", "\u20ac")
2244 check(br"\U0001d120", "\U0001d120")
2245
2246 def test_decode_errors(self):
2247 decode = codecs.raw_unicode_escape_decode
2248 for c, d in (b'u', 4), (b'U', 4):
2249 for i in range(d):
2250 self.assertRaises(UnicodeDecodeError, decode,
2251 b"\\" + c + b"0"*i)
2252 self.assertRaises(UnicodeDecodeError, decode,
2253 b"[\\" + c + b"0"*i + b"]")
2254 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2255 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2256 self.assertEqual(decode(data, "replace"),
2257 ("[\ufffd]\ufffd", len(data)))
2258 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2259 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2260 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2261
2262
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips of undecodable bytes through "surrogateescape"."""

    def test_utf8(self):
        handler = "surrogateescape"
        # Bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)
        # bad-utf-8 encoded surrogate
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)

    def test_ascii(self):
        handler = "surrogateescape"
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", handler), text)
        self.assertEqual(text.encode("ascii", handler), raw)

    def test_charmap(self):
        handler = "surrogateescape"
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", handler), text)
        self.assertEqual(text.encode("iso-8859-3", handler), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2295
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002296
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() streams when seeking and rewriting."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        bom_codecs = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        path = support.TESTFN
        for encoding in bom_codecs:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                # Read the whole file twice from the start
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002352
Victor Stinner3fed0872010-05-22 02:16:27 +00002353
# Names of the bytes-to-bytes codecs exercised by the transform tests;
# extended below when the optional compression modules can be imported.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alternative names that must resolve to each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so that code can test it for truthiness
    zlib = None
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002383
2384class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002385
Georg Brandl02524622010-12-02 18:06:51 +00002386 def test_basics(self):
2387 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002388 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002389 with self.subTest(encoding=encoding):
2390 # generic codecs interface
2391 (o, size) = codecs.getencoder(encoding)(binput)
2392 self.assertEqual(size, len(binput))
2393 (i, size) = codecs.getdecoder(encoding)(o)
2394 self.assertEqual(size, len(o))
2395 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002396
Georg Brandl02524622010-12-02 18:06:51 +00002397 def test_read(self):
2398 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002399 with self.subTest(encoding=encoding):
2400 sin = codecs.encode(b"\x80", encoding)
2401 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2402 sout = reader.read()
2403 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002404
2405 def test_readline(self):
2406 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002407 with self.subTest(encoding=encoding):
2408 sin = codecs.encode(b"\x80", encoding)
2409 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2410 sout = reader.readline()
2411 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002412
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002413 def test_buffer_api_usage(self):
2414 # We check all the transform codecs accept memoryview input
2415 # for encoding and decoding
2416 # and also that they roundtrip correctly
2417 original = b"12345\x80"
2418 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002419 with self.subTest(encoding=encoding):
2420 data = original
2421 view = memoryview(data)
2422 data = codecs.encode(data, encoding)
2423 view_encoded = codecs.encode(view, encoding)
2424 self.assertEqual(view_encoded, data)
2425 view = memoryview(data)
2426 data = codecs.decode(data, encoding)
2427 self.assertEqual(data, original)
2428 view_decoded = codecs.decode(view, encoding)
2429 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002430
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002431 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002432 # Check binary -> binary codecs give a good error for str input
2433 bad_input = "bad input type"
2434 for encoding in bytes_transform_encodings:
2435 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002436 fmt = ( "{!r} is not a text encoding; "
2437 "use codecs.encode\(\) to handle arbitrary codecs")
2438 msg = fmt.format(encoding)
2439 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002440 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002441 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002442
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002443 def test_text_to_binary_blacklists_text_transforms(self):
2444 # Check str.encode gives a good error message for str -> str codecs
2445 msg = (r"^'rot_13' is not a text encoding; "
2446 "use codecs.encode\(\) to handle arbitrary codecs")
2447 with self.assertRaisesRegex(LookupError, msg):
2448 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002449
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002450 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002451 # Check bytes.decode and bytearray.decode give a good error
2452 # message for binary -> binary codecs
2453 data = b"encode first to ensure we meet any format restrictions"
2454 for encoding in bytes_transform_encodings:
2455 with self.subTest(encoding=encoding):
2456 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002457 fmt = (r"{!r} is not a text encoding; "
2458 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002459 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002460 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002461 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002462 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002463 bytearray(encoded_data).decode(encoding)
2464
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002465 def test_binary_to_text_blacklists_text_transforms(self):
2466 # Check str -> str codec gives a good error for binary input
2467 for bad_input in (b"immutable", bytearray(b"mutable")):
2468 with self.subTest(bad_input=bad_input):
2469 msg = (r"^'rot_13' is not a text encoding; "
2470 "use codecs.decode\(\) to handle arbitrary codecs")
2471 with self.assertRaisesRegex(LookupError, msg) as failure:
2472 bad_input.decode("rot_13")
2473 self.assertIsNone(failure.exception.__cause__)
2474
Zachary Wareefa2e042013-12-30 14:54:11 -06002475 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002476 def test_custom_zlib_error_is_wrapped(self):
2477 # Check zlib codec gives a good error for malformed input
2478 msg = "^decoding with 'zlib_codec' codec failed"
2479 with self.assertRaisesRegex(Exception, msg) as failure:
2480 codecs.decode(b"hello", "zlib_codec")
2481 self.assertIsInstance(failure.exception.__cause__,
2482 type(failure.exception))
2483
2484 def test_custom_hex_error_is_wrapped(self):
2485 # Check hex codec gives a good error for malformed input
2486 msg = "^decoding with 'hex_codec' codec failed"
2487 with self.assertRaisesRegex(Exception, msg) as failure:
2488 codecs.decode(b"hello", "hex_codec")
2489 self.assertIsInstance(failure.exception.__cause__,
2490 type(failure.exception))
2491
2492 # Unfortunately, the bz2 module throws OSError, which the codec
2493 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002494
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002495 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2496 def test_aliases(self):
2497 for codec_name, aliases in transform_aliases.items():
2498 expected_name = codecs.lookup(codec_name).name
2499 for alias in aliases:
2500 with self.subTest(alias=alias):
2501 info = codecs.lookup(alias)
2502 self.assertEqual(info.name, expected_name)
2503
Nick Coghlan8b097b42013-11-13 23:49:21 +10002504
2505# The codec system tries to wrap exceptions in order to ensure the error
2506# mentions the operation being performed and the codec involved. We
2507# currently *only* want this to happen for relatively stateless
2508# exceptions, where the only significant information they contain is their
2509# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002510
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    """Return the entry stored in _TEST_CODECS for *codec_name*, or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2518
Nick Coghlan8b097b42013-11-13 23:49:21 +10002519class ExceptionChainingTest(unittest.TestCase):
2520
2521 def setUp(self):
2522 # There's no way to unregister a codec search function, so we just
2523 # ensure we render this one fairly harmless after the test
2524 # case finishes by using the test case repr as the codec name
2525 # The codecs module normalizes codec names, although this doesn't
2526 # appear to be formally documented...
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002527 # We also make sure we use a truly unique id for the custom codec
2528 # to avoid issues with the codec cache when running these tests
2529 # multiple times (e.g. when hunting for refleaks)
2530 unique_id = repr(self) + str(id(self))
2531 self.codec_name = encodings.normalize_encoding(unique_id).lower()
2532
2533 # We store the object to raise on the instance because of a bad
2534 # interaction between the codec caching (which means we can't
2535 # recreate the codec entry) and regrtest refleak hunting (which
2536 # runs the same test instance multiple times). This means we
2537 # need to ensure the codecs call back in to the instance to find
2538 # out which exception to raise rather than binding them in a
2539 # closure to an object that may change on the next run
2540 self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002541
Nick Coghlan4e553e22013-11-16 00:35:34 +10002542 def tearDown(self):
2543 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002544
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002545 def set_codec(self, encode, decode):
2546 codec_info = codecs.CodecInfo(encode, decode,
Nick Coghlan4e553e22013-11-16 00:35:34 +10002547 name=self.codec_name)
2548 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002549
2550 @contextlib.contextmanager
2551 def assertWrapped(self, operation, exc_type, msg):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002552 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002553 operation, self.codec_name, exc_type.__name__, msg)
2554 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2555 yield caught
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002556 self.assertIsInstance(caught.exception.__cause__, exc_type)
Nick Coghlan77b286b2014-01-27 00:53:38 +10002557 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002558
2559 def raise_obj(self, *args, **kwds):
2560 # Helper to dynamically change the object raised by a test codec
2561 raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002562
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002563 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002564 self.obj_to_raise = obj_to_raise
2565 self.set_codec(self.raise_obj, self.raise_obj)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002566 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002567 "str_input".encode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002568 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002569 codecs.encode("str_input", self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002570 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002571 b"bytes input".decode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002572 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002573 codecs.decode(b"bytes input", self.codec_name)
2574
2575 def test_raise_by_type(self):
2576 self.check_wrapped(RuntimeError, "")
2577
2578 def test_raise_by_value(self):
2579 msg = "This should be wrapped"
2580 self.check_wrapped(RuntimeError(msg), msg)
2581
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002582 def test_raise_grandchild_subclass_exact_size(self):
2583 msg = "This should be wrapped"
2584 class MyRuntimeError(RuntimeError):
2585 __slots__ = ()
2586 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2587
2588 def test_raise_subclass_with_weakref_support(self):
2589 msg = "This should be wrapped"
2590 class MyRuntimeError(RuntimeError):
2591 pass
2592 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2593
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002594 def check_not_wrapped(self, obj_to_raise, msg):
2595 def raise_obj(*args, **kwds):
2596 raise obj_to_raise
2597 self.set_codec(raise_obj, raise_obj)
2598 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002599 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002600 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002601 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002602 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002603 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002604 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002605 codecs.decode(b"bytes input", self.codec_name)
2606
2607 def test_init_override_is_not_wrapped(self):
2608 class CustomInit(RuntimeError):
2609 def __init__(self):
2610 pass
2611 self.check_not_wrapped(CustomInit, "")
2612
2613 def test_new_override_is_not_wrapped(self):
2614 class CustomNew(RuntimeError):
2615 def __new__(cls):
2616 return super().__new__(cls)
2617 self.check_not_wrapped(CustomNew, "")
2618
2619 def test_instance_attribute_is_not_wrapped(self):
2620 msg = "This should NOT be wrapped"
2621 exc = RuntimeError(msg)
2622 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002623 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002624
2625 def test_non_str_arg_is_not_wrapped(self):
2626 self.check_not_wrapped(RuntimeError(1), "1")
2627
2628 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002629 msg_re = r"^\('a', 'b', 'c'\)$"
2630 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002631
2632 # http://bugs.python.org/issue19609
def test_codec_lookup_failure_not_wrapped(self):
    # NOTE(review): relies on self.codec_name not being registered when
    # this test runs -- confirm against the enclosing class's setUp.
    name = self.codec_name
    msg = "^unknown encoding: {}$".format(name)
    # The initial codec lookup should not be wrapped
    lookups = (
        lambda: "str input".encode(name),
        lambda: codecs.encode("str input", name),
        lambda: b"bytes input".decode(name),
        lambda: codecs.decode(b"bytes input", name),
    )
    for lookup in lookups:
        with self.assertRaisesRegex(LookupError, msg):
            lookup()
2644
def test_unflagged_non_text_codec_handling(self):
    # The stdlib non-text codecs are now marked so they're
    # pre-emptively skipped by the text model related methods
    # However, third party codecs won't be flagged, so we still make
    # sure the case where an inappropriate output type is produced is
    # handled appropriately
    def encode_to_str(*args, **kwds):
        return "not bytes!", 0
    def decode_to_bytes(*args, **kwds):
        return b"not str!", 0
    self.set_codec(encode_to_str, decode_to_bytes)
    # No input or output type checks on the codecs module functions
    encoded = codecs.encode(None, self.codec_name)
    self.assertEqual(encoded, "not bytes!")
    decoded = codecs.decode(None, self.codec_name)
    self.assertEqual(decoded, b"not str!")
    # Text model methods should complain
    # Both fragments of each pattern are raw strings: the second one
    # contains the regex escapes \( and \), which are invalid escape
    # sequences in an ordinary string literal.
    fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
           r"use codecs.encode\(\) to encode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        "str_input".encode(self.codec_name)
    fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
           r"use codecs.decode\(\) to decode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        b"bytes input".decode(self.codec_name)
2672
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002673
Georg Brandl02524622010-12-02 18:06:51 +00002674
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Exercises codecs.code_page_encode() and codecs.code_page_decode()
    # with several code pages.  Each case is a 3-tuple
    # (input, error handler name, expected output); an expected output of
    # None means the call has to raise the matching Unicode error.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 must be rejected with ValueError, 123 with OSError.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text has to mention the code page that failed.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run every (raw, errors, expected) case through code page *cp*.

        When *expected* is None the decode call must raise
        UnicodeDecodeError; otherwise the decoded text must equal
        *expected* and the reported length must lie in [0, len(raw)].
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run every (text, errors, expected) case through code page *cp*.

        When *expected* is None the encode call must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the reported length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = result[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        # The UTF-8 encode cases only run when the module-level
        # VISTA_OR_LATER flag is true.
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each call passes False as the fourth argument of
        # code_page_decode(); the expected (text, length) pairs show that
        # a trailing partial sequence is not counted in the length.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2822
2823
# Run the whole test module when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()