blob: 856126c4a45995d6a2fe2f5360adb01c870321e5 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major version of 6 or higher;
# the version is never queried on other platforms (short-circuit `and`).
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)
16
# ctypes is optional: when it cannot be imported the name is bound to None
# and the wchar_t size is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to a test case.

    The helper runs *coder* on ``input`` and asserts, through
    ``self.assertEqual``, that the result is ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))

    return check
29
class Queue:
    """
    FIFO buffer: data written at one end is read back from the other end.
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* drains the whole buffer.
        """
        pending = self._buffer
        if size < 0:
            # Slicing with [:0] leaves an empty buffer of the same type.
            self._buffer = pending[:0]
            return pending
        self._buffer = pending[size:]
        return pending[:size]
49
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    The host class must provide the unittest assertion methods used here
    (assertIsInstance, assertFalse, assertEqual).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state survives a getstate()/setstate() round trip.

        For every split position ``i`` of the bytes *s*, the first part is
        decoded, the decoder state is transferred to a fresh decoder, and
        the rest is decoded with ``final=True``. The two parts together
        must equal the expected text *u*.
        """
        # The factory does not depend on the split position: look it up once.
        make_decoder = codecs.getincrementaldecoder(encoding)
        for i in range(len(s) + 1):
            d = make_decoder()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = make_decoder()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1 + part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state survives a getstate()/setstate() round trip.

        For every split position ``i`` of the text *u*, the first part is
        encoded, the encoder state is transferred to a fresh encoder, and
        the rest is encoded with ``final=True``. The two parts together
        must equal the expected bytes *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for i in range(len(u) + 1):
            d = make_encoder()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = make_encoder()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1 + part2)
82
class ReadTest(MixInCheckStateHandling):
    """Decoding tests shared by the per-encoding test classes.

    The methods read ``self.encoding`` and (for the lone-surrogate test)
    ``self.ill_formed_sequence``, which the concrete classes define.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one byte at a time and compare each partial result.

        *input* is encoded with ``self.encoding`` and fed byte by byte to a
        StreamReader, to an incremental decoder, to the same decoder after
        reset(), and finally to codecs.iterdecode(). After each byte the
        text decoded so far must equal the matching entry of
        *partialresults*.
        """
        encoded = input.encode(self.encoding)
        single_bytes = [bytes([value]) for value in encoded]

        # StreamReader reading from a queue that grows one byte at a time
        queue = Queue(b"")
        reader = codecs.getreader(self.encoding)(queue)
        decoded = ""
        for chunk, expected in zip(single_bytes, partialresults):
            queue.write(chunk)
            decoded += reader.read()
            self.assertEqual(decoded, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(reader.read(), "")
        self.assertEqual(reader.bytebuffer, b"")

        def feed_bytewise(decoder):
            decoded = ""
            for chunk, expected in zip(single_bytes, partialresults):
                decoded += decoder.decode(chunk)
                self.assertEqual(decoded, expected)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        incremental = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(incremental)

        # Check whether the reset method works properly
        incremental.reset()
        feed_bytewise(incremental)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding)),
        )

    def test_readline(self):
        def make_reader(text):
            raw = io.BytesIO(text.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        def joined_lines(text, keepends=True, size=None):
            reader = make_reader(text)
            # readline() returns "" once the input is exhausted, which is
            # the sentinel that ends the iteration.
            return "|".join(iter(
                lambda: reader.readline(size=size, keepends=keepends), ""))

        mixed = "foo\nbar\r\nbaz\rspam\u2028eggs"
        with_ends = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        without_ends = "foo|bar|baz|spam|eggs"
        self.assertEqual(joined_lines(mixed, True), with_ends)
        self.assertEqual(joined_lines(mixed, False), without_ends)
        self.assertEqual(joined_lines(mixed, True, 10), with_ends)
        self.assertEqual(joined_lines(mixed, False, 10), without_ends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        bodies = [(200 * (k + 1)) * "\u3042" for k in range(len(lineends))]
        terminated = [body + end for body, end in zip(bodies, lineends)]
        self.assertEqual(joined_lines("".join(terminated), True),
                         "|".join(terminated))
        self.assertEqual(joined_lines("".join(terminated), False),
                         "|".join(bodies))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            prefix = size * "a"
            for lineend in lineends:
                text = 10 * (prefix + lineend + "xxx\n")
                expectations = (
                    (True, prefix + lineend, "xxx\n"),
                    (False, prefix, "xxx"),
                )
                for keepends, first, second in expectations:
                    reader = make_reader(text)
                    for _ in range(10):
                        self.assertEqual(
                            reader.readline(keepends=keepends), first)
                        self.assertEqual(
                            reader.readline(keepends=keepends), second)

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        remaining_lines = lines[1:]

        def fresh_reader():
            raw = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(raw)

        # Issue #8260: Test readline() followed by read()
        f = fresh_reader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(remaining_lines))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = fresh_reader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), remaining_lines)
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = fresh_reader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = fresh_reader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + remaining_lines)
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating a StreamReader over CRLF-terminated input must yield
        # the original lines unchanged.
        # NOTE(review): runs of spaces inside these literals look collapsed
        # in the copy this was taken from -- confirm against upstream.
        source_lines = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        raw = io.BytesIO("".join(source_lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        for index, line in enumerate(reader):
            self.assertEqual(line, source_lines[index])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so text written
        # after a readline() becomes visible to the next readline().
        queue = Queue(b"")
        writer = codecs.getwriter(self.encoding)(queue)
        reader = codecs.getreader(self.encoding)(queue)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"
        expected_lines = [s1, s2, s3]

        raw = io.BytesIO("".join(expected_lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        # every line in order, then "" at end of input
        for expected in expected_lines + [""]:
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"
        expected_lines = [s1, s2, s3, s4, s5]

        raw = io.BytesIO("".join(expected_lines).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(raw)
        # every line in order, then "" at end of input
        for expected in expected_lines + [""]:
            self.assertEqual(reader.readline(), expected)

    # Text expected in place of the ill-formed sequence when decoding
    # with the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        # Encoding a lone surrogate with each lenient error handler
        handler_results = [
            ("backslashreplace", "[\\udc80]"),
            ("xmlcharrefreplace", "[&#56448;]"),
            ("ignore", "[]"),
            ("replace", "[?]"),
        ]
        for handler, expected_text in handler_results:
            self.assertEqual("[\uDC80]".encode(self.encoding, handler),
                             expected_text.encode(self.encoding))

        # Whatever the codec emits for empty text is stripped from the
        # individually encoded pieces and prepended once to the whole.
        bom = "".encode(self.encoding)
        skip = len(bom)
        surroundings = [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]
        for before, after in surroundings:
            before_sequence = before.encode(self.encoding)[skip:]
            after_sequence = after.encode(self.encoding)[skip:]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(
                test_string.encode(self.encoding, "surrogatepass"),
                test_sequence)
            self.assertEqual(
                test_sequence.decode(self.encoding, "surrogatepass"),
                test_string)
            self.assertEqual(
                test_sequence.decode(self.encoding, "ignore"),
                before + after)
            self.assertEqual(
                test_sequence.decode(self.encoding, "replace"),
                before + self.ill_formed_sequence_replace + after)
379
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order signalled by a BOM)."""

    encoding = "utf-32"
    # Bytes of the lone surrogate U+DC80, laid out for the host byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in each byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the offending bytes when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with neither BOM must be rejected.
        for raw in (4*b"\xff", 8*b"\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final code unit is replaced or dropped depending on
        # the error handler; one input byte is consumed either way.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
474
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Little-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One entry per encoded byte: a character shows up only once its
        # fourth byte has been fed.
        expected_after_each_byte = [
            "", "", "",
            "\x00", "\x00", "\x00", "\x00",
            "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
            "\x00\xff\u0100", "\x00\xff\u0100",
            "\x00\xff\u0100", "\x00\xff\u0100",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # a single byte is a truncated code unit
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
519
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Big-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One entry per encoded byte: a character shows up only once its
        # fourth byte has been fed.
        expected_after_each_byte = [
            "", "", "",
            "\x00", "\x00", "\x00", "\x00",
            "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
            "\x00\xff\u0100", "\x00\xff\u0100",
            "\x00\xff\u0100", "\x00\xff\u0100",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # a single byte is a truncated code unit
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
564
565
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (byte order signalled by a BOM)."""

    encoding = "utf-16"
    # Bytes of the lone surrogate U+DC80, laid out for the host byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, in each byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it; assertIn reports
        # the offending bytes when the check fails
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with neither BOM must be rejected.
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final code unit is replaced or dropped depending on
        # the error handler; one input byte is consumed either way.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000651
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Little-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # One entry per encoded byte: a BMP character shows up after its
        # second byte, the non-BMP character after its fourth.
        expected_after_each_byte = [
            "",
            "\x00", "\x00",
            "\x00\xff", "\x00\xff",
            "\x00\xff\u0100", "\x00\xff\u0100",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected_after_each_byte)

    def test_errors(self):
        # (malformed input, expected text with the 'replace' handler)
        malformed = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for data, replaced in malformed:
            # strict decoding must fail ...
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              data, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            self.assertEqual(data.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        # U+10203 round-trips through a surrogate pair
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
695
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec ("utf-16-be").

    The generic reading tests come from ReadTest; this class supplies the
    codec name plus UTF-16-BE specific data.
    """

    encoding = "utf-16-be"
    # U+DC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        """Run the inherited check_partial() helper on a mixed-width sample.

        NOTE(review): the expected list has one entry per encoded byte and
        presumably holds the text decoded so far after each byte -- confirm
        against ReadTest.check_partial.
        """
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Malformed input must raise in strict mode and decode to U+FFFD
        with the 'replace' handler.

        Each case runs in its own subTest so that one failing input is
        reported by name and does not hide the remaining cases.
        """
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                # final=True: the truncated tail has to be reported as an
                # error instead of being kept back for a later call.
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
739
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-8 codec; generic reading tests come from ReadTest."""

    encoding = "utf-8"
    # UTF-8 style encoding of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        """Run the inherited check_partial() helper on a sample covering
        1-, 2-, 3- and 4-byte sequences.

        NOTE(review): the expected list has one entry per encoded byte and
        presumably holds the text decoded so far after each byte -- confirm
        against ReadTest.check_partial.
        """
        sample = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(sample, expected_steps)

    def test_decoder_state(self):
        """Exercise decoder state handling on text spanning every sequence
        length, via the inherited check_state_handling_decode() helper."""
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        """Run the inherited lone-surrogate checks, then the UTF-8-only
        surrogateescape case."""
        super().test_lone_surrogates()
        # It is unclear whether an equivalent check makes sense for
        # UTF-16 and UTF-32, so it is only performed here.
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        """The 'surrogatepass' handler round-trips lone surrogates but still
        rejects truncated or malformed surrogate sequences."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated sequence, then one whose last byte is not a
        # continuation byte.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000793
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows code page 65001 codec."""

    encoding = "cp65001"

    def test_encode(self):
        """Encode sample strings; an expected value of None means the call
        must raise UnicodeEncodeError."""
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The lone-surrogate expectations depend on the Windows version.
        if not VISTA_OR_LATER:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            tests += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode sample byte strings; an expected value of None means the
        call must raise UnicodeDecodeError."""
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        # The encoded-surrogate expectations depend on the Windows version.
        if not VISTA_OR_LATER:
            tests.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and handled according
        to each error handler otherwise."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """The 'surrogatepass' handler round-trips lone surrogates."""
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))

    def test_readline(self):
        """Skipped unconditionally; see the skip message for the reason."""
        self.skipTest("issue #20571: code page 65001 codec does not "
                      "support partial decoder yet")
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200896
897
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec; generic reading tests come from ReadTest."""

    encoding = "utf-7"

    def test_partial(self):
        """Run the inherited check_partial() helper on text mixing direct
        characters with base64-shifted ones.

        NOTE(review): the expected list presumably holds the text decoded so
        far after each input byte -- confirm against ReadTest.check_partial.
        """
        sample = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_steps = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(sample, expected_steps)

    def test_errors(self):
        """Malformed input must raise in strict mode and decode to the given
        text with the 'replace' handler."""
        cases = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character and its explicit surrogate pair encode to the
        same bytes, which decode back to the single character."""
        encoded = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(self.encoding), encoded)
        self.assertEqual('\ud801\udca0'.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Overrides the test_lone_surrogates inherited from ReadTest so that it
    # is not run for UTF-7.
    test_lone_surrogates = None
971
972
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        """A single stray byte with final=True cannot be decoded."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the function without arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
980
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of bytes is returned as bytes along with its length."""
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
996
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM).

    Inherits the UTF-8 tests and adds BOM-specific ones.
    """

    encoding = "utf-8-sig"

    def test_partial(self):
        """Run the inherited check_partial() helper on text that itself
        starts with U+FEFF, so the encoded form carries two BOMs.

        NOTE(review): the expected list presumably holds the text decoded so
        far after each input byte -- confirm against ReadTest.check_partial.
        """
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        """SF bug #1601501: check that the codec works with a buffer."""
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder strips the BOM written by the encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader in chunks and
        check that the concatenated output equals *unistring*.

        The read is repeated with no size argument and with a range of
        explicit sizes; each variant runs in its own subTest so a failure
        names the size that triggered it.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()
                    # An empty result ends the read loop.
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """Stream reading drops a leading BOM from the input."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        """Stream reading also accepts input that has no BOM."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1080
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty output with nothing consumed."""
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))

    def test_raw(self):
        """Any byte other than a backslash, followed by b'0', passes through
        unchanged."""
        decode = codecs.escape_decode
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            payload = byte + b'0'
            self.assertEqual(decode(payload), (payload, 2))

    def test_escape(self):
        """Recognised escapes are translated; unrecognised ones are kept
        verbatim, backslash included."""
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Every byte that does not start one of the escapes listed in the
        # bytes literal below must be left untouched after a backslash.
        for value in range(256):
            if value in b'\n"\'\\abtnvfr01234567x':
                continue
            sequence = b'\\' + bytes([value])
            check(sequence, sequence)

    def test_errors(self):
        """Truncated \\x escapes raise ValueError in the default mode and are
        dropped or replaced by '?' with the 'ignore'/'replace' handlers."""
        decode = codecs.escape_decode
        with self.assertRaises(ValueError):
            decode(br"\x")
        with self.assertRaises(ValueError):
            decode(br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        with self.assertRaises(ValueError):
            decode(br"\x0")
        with self.assertRaises(ValueError):
            decode(br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001132
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile()."""

    def test_recoding(self):
        """Write through a recoding wrapper and close it.

        There is nothing to assert: Python used to crash on this at exit
        because of a refcount bug in _codecsmodule.c.
        """
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001141
# Punycode sample strings, from RFC 3492.
# Each entry is a pair: (text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (
        "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
        "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
        b"egbpdaj6bu4bxfgehfvwxn",
    ),
    # (B) Chinese (simplified):
    (
        "\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
        b"ihqwcrb4cv8a8dqg056pqjye",
    ),
    # (C) Chinese (traditional):
    (
        "\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
        b"ihqwctvzc91f659drss3x8bo0yb",
    ),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (
        "\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
        "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
        "\u0065\u0073\u006B\u0079",
        b"Proprostnemluvesky-uyb24dma41a",
    ),
    # (E) Hebrew:
    (
        "\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
        "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
        "\u05D1\u05E8\u05D9\u05EA",
        b"4dbcagdahymbxekheh6e0a7fei0b",
    ),
    # (F) Hindi (Devanagari):
    (
        "\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
        "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
        "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
        "\u0939\u0948\u0902",
        b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd",
    ),

    # (G) Japanese (kanji and hiragana):
    (
        "\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
        "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
        b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa",
    ),

    # (H) Korean (Hangul syllables):
    (
        "\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
        "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
        "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
        b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
        b"psd879ccm6fea98c",
    ),

    # (I) Russian (Cyrillic):
    (
        "\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
        "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
        "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
        "\u0438",
        b"b1abfaaepdrnnbgefbaDotcwatmq2g4l",
    ),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (
        "\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
        "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
        "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
        "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
        "\u0061\u00F1\u006F\u006C",
        b"PorqunopuedensimplementehablarenEspaol-fmd56a",
    ),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (
        "\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
        "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
        "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
        "\u0056\u0069\u1EC7\u0074",
        b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g",
    ),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (
        "\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
        b"3B-ww4c5e180e575a65lsy2b",
    ),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (
        "\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
        "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
        "\u004F\u004E\u004B\u0045\u0059\u0053",
        b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n",
    ),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (
        "\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
        "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
        "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
        b"Hello-Another-Way--fc4qua05auwb3674vfr0b",
    ),

    # (O) <hitotsu><yane><no><shita>2
    (
        "\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
        b"2-u9tlzr9756bt3uc0v",
    ),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (
        "\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
        "\u308B\u0035\u79D2\u524D",
        b"MajiKoi5-783gue6qz075azm5e",
    ),

    # (Q) <pafii>de<runba>
    (
        "\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
        b"de-jg4avhby1noc0d",
    ),

    # (R) <sono><supiido><de>
    (
        "\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
        b"d9juau41awczczp",
    ),

    # (S) -> $1.00 <-
    (
        "\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
        "\u003C\u002D",
        b"-> $1.00 <--",
    ),
]

# Import-time sanity check of the table above: print any entry that is not
# a two-item pair.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001249
class PunycodeTest(unittest.TestCase):
    """Checks the punycode codec against the punycode_testcases table."""

    def test_encode(self):
        """Encoding each sample yields the expected bytes, ignoring case."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii")
            wanted = str(puny, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding works on the table bytes and on an ASCII round-tripped
        copy of them."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            round_tripped = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, round_tripped.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001268
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec.

    Several tests only apply where wchar_t is 32 bits wide and are skipped
    otherwise. Calls into the codec are wrapped in support.check_warnings()
    because they are expected to emit a DeprecationWarning.
    """

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Decoding must reject values that are not valid code points."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; flip them on little-endian
        # machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the last code point U+10FFFF.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError raised for bad input describes the codec,
        the input object and the offending byte range."""
        raw = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises reports "UnicodeDecodeError not raised" when decoding
        # unexpectedly succeeds, and hands back the exception to inspect.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                raw.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(raw, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A custom error handler registered under a new name is honoured:
        the four invalid bytes in the middle are ignored."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items consumed."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001344
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected), both given as UTF-8 encoded bytes:
#   * expected is None when nameprep() must reject the input with a
#     UnicodeError (the input contains prohibited characters);
#   * (None, None) marks a vector that is deliberately skipped.
# Entries are positional: entry number N (1-based) is draft test "3.N", and
# NameprepTest reports failures using that number, so skipped vectors must
# keep their placeholder.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the input bytes \xef\xb9\xb6 decode to U+FE76, not the
    # U+FB38 named in the title above -- confirm against the draft.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case presumably reads \xc3\xdf (as noted for 3.3 and
    # 3.44); the corrected sequence \xc3\x9f is what is used below.
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1497
1498
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() against the ``nameprep_tests`` vectors."""

    def test_nameprep(self):
        """Check every non-skipped vector and report its draft number.

        A vector whose expected value is None must make nameprep() raise
        UnicodeError; any other vector must produce exactly the expected
        text.  Whatever goes wrong, the failure is re-raised as
        support.TestFailed prefixed with "Test 3.N" so the offending
        vector can be identified; the original exception is chained.
        """
        from encodings.idna import nameprep
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            try:
                # The Unicode strings are given in UTF-8
                text = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, text)
                else:
                    expected = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(text), expected)
            except Exception as e:
                # Both branches are wrapped so that a missing or unexpected
                # exception in the "prohibited" case is attributed to its
                # vector just like a mismatch in the "expected" case.
                raise support.TestFailed(
                    "Test 3.%d: %s" % (number, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001517
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes ASCII and ACE ("xn--") labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") is the inverse of test_builtin_decode."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna StreamReader yields ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte-by-byte with iterdecode() and an incremental decoder."""
        # The same four names as test_builtin_decode: ASCII and non-ASCII,
        # each with and without a trailing dot.  (The non-ASCII name
        # without a trailing dot used to be missing here because the
        # trailing-dot case was listed twice.)
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected,
                "input=%r" % (encoded,)
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that terminates it is seen;
        # the last label is held back until final=True.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode char-by-char with iterencode() and an incremental encoder."""
        # Mirror of the cases in test_incremental_decode; the duplicated
        # trailing-dot case was replaced by the missing no-dot variant.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected,
                "input=%r" % (text,)
            )

        encoder = codecs.getincrementalencoder("idna")()
        # As for decoding, the final label is buffered until final=True
        # unless it is terminated by a dot.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001593
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of module-level codecs calls."""

    def test_decode(self):
        """codecs.decode(): explicit codec, default codec, and errors."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit codec, default codec, and errors."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() needs a name and fails on unknown ones."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """Codec lookup must still work under a Turkish LC_CTYPE locale."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        category = locale.LC_CTYPE
        previous = locale.setlocale(category)
        # Put the previous locale back whatever happens below.
        self.addCleanup(locale.setlocale, category, previous)
        try:
            locale.setlocale(category, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1648
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """The stream splits into two lines; only the first keeps its newline."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001658
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        """Read UTF-8 data as UTF-16-LE, then write UTF-8 data as Latin-1."""
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001670
# Names of the text codecs that the BasicUnicodeTest loops iterate over.
# Names are spelled with underscores; test_basics compares each one with
# the name reported by codecs.lookup() after mapping "_" to "-".
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module exposes
# mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1776
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (they are excluded from the stream reader/writer checks in test_basics
# and from test_seek).
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings excluded from the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001789
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII text through each codec's interfaces.

        For every encoding this checks the registered name, the stateless
        encoder/decoder, the stream writer/reader (unless listed in
        broken_unicode_with_streams) and the incremental coders together
        with iterencode()/iterdecode() (unless listed in
        broken_incremental_coders).
        """
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            where = "encoding=%r" % encoding

            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                encoded, consumed = codecs.getencoder(encoding)(text)
                self.assertEqual(consumed, len(text), where)
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(decoded, text, where)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                sink = Queue(b"")
                writer = codecs.getwriter(encoding)(sink)
                pieces = []
                for char in text:
                    writer.write(char)
                    piece = sink.read()
                    self.assertTrue(type(piece) is bytes, type(piece))
                    pieces.append(piece)
                encoded = b"".join(pieces)

                source = Queue(b"")
                reader = codecs.getreader(encoding)(source)
                decoded = ""
                for byte in encoded:
                    # feed the reader a single byte at a time
                    source.write(bytes([byte]))
                    decoded += reader.read()
                self.assertEqual(decoded, text, where)

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encoded = b"".join(encoder.encode(char) for char in text)
                encoded += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = "".join(decoder.decode(bytes([byte]))
                                  for byte in encoded)
                decoded += decoder.decode(b"", True)
                self.assertEqual(decoded, text, where)

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                roundtrip = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(roundtrip, text, where)

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode("", encoding)
                roundtrip = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(roundtrip, "")

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = b""
                    for char in text:
                        encoded += encoder.encode(char)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decoded = ""
                    for byte in encoded:
                        decoded += decoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text, where)

    @support.cpython_only
    def test_basics_capi(self):
        """Repeat the incremental round-trips with coders from _testcapi."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            where = "encoding=%r" % encoding

            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encoded = b"".join(cencoder.encode(char) for char in text)
                encoded += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decoded = "".join(cdecoder.decode(bytes([byte]))
                                  for byte in encoded)
                decoded += cdecoder.decode(b"", True)
                self.assertEqual(decoded, text, where)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = b""
                    for char in text:
                        encoded += cencoder.encode(char)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decoded = ""
                    for byte in encoded:
                        decoded += cdecoder.decode(bytes([byte]))
                    self.assertEqual(decoded, text, where)

    def test_seek(self):
        """Rewinding a stream reader must let it re-read the whole text."""
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is excluded here: FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) an int argument."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError):
                    encoder()

    def test_encoding_map_type_initialized(self):
        """Taking the type of a charmap encoding table must not crash."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        """Run the mixin's state-handling checks for each eligible codec."""
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text,
                                             text.encode(encoding))
            self.check_state_handling_encode(encoding, text,
                                             text.encode(encoding))
1951
class CharmapTest(unittest.TestCase):
    # Every test decodes the same three bytes with codecs.charmap_decode()
    # and walks a table of (errors, mapping, expected) rows in order.
    # "expected" is either the (text, consumed) tuple the call must return
    # or the exception class it must raise.
    _three_bytes = b"\x00\x01\x02"

    def _run_cases(self, cases):
        # Apply each table row in sequence; the first failing row aborts
        # the test, exactly like a run of individual assertions.
        for errors, mapping, expected in cases:
            if isinstance(expected, type):
                self.assertRaises(expected, codecs.charmap_decode,
                                  self._three_bytes, errors, mapping)
            else:
                self.assertEqual(
                    codecs.charmap_decode(self._three_bytes, errors, mapping),
                    expected
                )

    def test_decode_with_string_map(self):
        self._run_cases([
            ("strict", "abc", ("abc", 3)),
            ("strict", "\U0010FFFFbc", ("\U0010FFFFbc", 3)),
            # Map too short for byte 2, or byte 2 mapped to U+FFFE
            ("strict", "ab", UnicodeDecodeError),
            ("strict", "ab\ufffe", UnicodeDecodeError),
            ("replace", "ab", ("ab\ufffd", 3)),
            ("replace", "ab\ufffe", ("ab\ufffd", 3)),
            ("ignore", "ab", ("ab", 3)),
            ("ignore", "ab\ufffe", ("ab", 3)),
        ])

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        self._run_cases([
            ("strict", {0: 'a', 1: 'b', 2: 'c'}, ("abc", 3)),
            ("strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}, ("AaBbCc", 3)),
            ("strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'},
             ("\U0010FFFFbc", 3)),
            ("strict", {0: 'a', 1: 'b', 2: ''}, ("ab", 3)),
            ("strict", {0: 'a', 1: 'b'}, UnicodeDecodeError),
            ("strict", {0: 'a', 1: 'b', 2: None}, UnicodeDecodeError),
            # Issue #14850
            ("strict", {0: 'a', 1: 'b', 2: '\ufffe'}, UnicodeDecodeError),
            ("replace", {0: 'a', 1: 'b'}, ("ab\ufffd", 3)),
            ("replace", {0: 'a', 1: 'b', 2: None}, ("ab\ufffd", 3)),
            # Issue #14850
            ("replace", {0: 'a', 1: 'b', 2: '\ufffe'}, ("ab\ufffd", 3)),
            ("ignore", {0: 'a', 1: 'b'}, ("ab", 3)),
            ("ignore", {0: 'a', 1: 'b', 2: None}, ("ab", 3)),
            # Issue #14850
            ("ignore", {0: 'a', 1: 'b', 2: '\ufffe'}, ("ab", 3)),
        ])

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')

        self._run_cases([
            ("strict", {0: a, 1: b, 2: c}, ("abc", 3)),
            # Issue #15379
            ("strict", {0: 0x10FFFF, 1: b, 2: c}, ("\U0010FFFFbc", 3)),
            ("strict", {0: sys.maxunicode, 1: b, 2: c},
             (chr(sys.maxunicode) + "bc", 3)),
            # A code point above sys.maxunicode is a TypeError
            ("strict", {0: sys.maxunicode + 1, 1: b, 2: c}, TypeError),
            ("strict", {0: a, 1: b}, UnicodeDecodeError),
            ("strict", {0: a, 1: b, 2: 0xFFFE}, UnicodeDecodeError),
            ("replace", {0: a, 1: b}, ("ab\ufffd", 3)),
            ("replace", {0: a, 1: b, 2: 0xFFFE}, ("ab\ufffd", 3)),
            ("ignore", {0: a, 1: b}, ("ab", 3)),
            ("ignore", {0: a, 1: b, 2: 0xFFFE}, ("ab", 3)),
        ])
2145
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002146
class WithStmtTest(unittest.TestCase):
    # Both codecs wrappers below must be usable as context managers.

    def test_encodedfile(self):
        # UTF-8 bytes in the underlying file are read back as Latin-1
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as wrapped:
            content = wrapped.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        stream = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                           utf8.streamwriter, 'strict')
        with stream as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002159
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        # mbcs_decode is only checked when the codecs module provides it
        if hasattr(codecs, "mbcs_decode"):
            names.append("mbcs_decode")
        decoders = [getattr(codecs, name) for name in names]
        for decode in decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for source in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(source), ("\u1234", 6))

        # An escape beyond U+10FFFF is an error under the default handler
        # and becomes U+FFFD with "replace".
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2195
Serhiy Storchakad6793772013-01-29 10:20:44 +02002196
class UnicodeEscapeTest(unittest.TestCase):
    # Covers codecs.unicode_escape_encode() and
    # codecs.unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Characters 32..126 other than the backslash encode to themselves
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash, followed by b'0', decodes to
        # the character with the same code followed by '0'
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(
                codecs.unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # The other characters below 32, and 127..255, use the \xNN form
        for code in range(32):
            if chr(code) not in '\t\n\r':
                check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        recognised = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for source, expected in recognised:
            check(source, expected)
        # A backslash before any other byte is kept together with that byte
        for code in range(256):
            if code not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes carrying 0..d-1 zero digits
        for letter, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for digits in range(d):
                truncated = b"\\" + letter + b"0"*digits
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2273
2274
class RawUnicodeEscapeTest(unittest.TestCase):
    # Covers codecs.raw_unicode_escape_encode() and
    # codecs.raw_unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Every character below 256 encodes to the byte with the same value
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte, followed by b'0', decodes to the character with the
        # same code followed by '0'
        for code in range(256):
            self.assertEqual(
                codecs.raw_unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but u/U passes through unchanged
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but u/U passes through unchanged
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Truncated \u and \U escapes carrying 0..d-1 zero digits
        for letter, d in (b'u', 4), (b'U', 4):
            for digits in range(d):
                truncated = b"\\" + letter + b"0"*digits
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2323
2324
class SurrogateEscapeTest(unittest.TestCase):
    # Round trips through the "surrogateescape" error handler.

    def _check_roundtrip(self, raw, text, encoding):
        # Decoding raw must give text, and encoding text must give raw back
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "utf-8")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip(b"\xed\xb0\x80", "\udced\udcb0\udc80", "utf-8")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "ascii")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip(b"foo\xa5bar", "foo\udca5bar", "iso-8859-3")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2357
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002358
class BomTest(unittest.TestCase):
    def test_seek0(self):
        text = "1234567890"
        bom_encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(name):
            # Every scenario starts from a freshly opened 'w+' test file
            return codecs.open(support.TESTFN, 'w+', encoding=name)

        for name in bom_encodings:
            # Check if the BOM is written only once
            with reopen(name) as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(name) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(name) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(name) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(name) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002414
Victor Stinner3fed0872010-05-22 02:16:27 +00002415
# Names of the bytes-to-bytes transform codecs under test.  Codecs backed by
# an optional module are appended further down when that module imports.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alias names expected to resolve to each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so that later code can test it for truthiness
    zlib = None

if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002445
class TransformCodecTest(unittest.TestCase):
    # Tests for the bytes-to-bytes transform codecs named in
    # bytes_transform_encodings, and for the errors produced when a
    # transform codec is used through str.encode(), bytes.decode() or
    # bytearray.decode().
    #
    # The expected-message patterns below are regular expressions, so the
    # parentheses of "encode()" / "decode()" are escaped.  Every piece of
    # those patterns is a raw string literal: "\(" is not a valid Python
    # string escape and triggers an invalid-escape warning otherwise.

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The original error is chained as __cause__ and has the same type
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The original error is chained as __cause__ and has the same type
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)
2565
Nick Coghlan8b097b42013-11-13 23:49:21 +10002566
2567# The codec system tries to wrap exceptions in order to ensure the error
2568# mentions the operation being performed and the codec involved. We
2569# currently *only* want this to happen for relatively stateless
2570# exceptions, where the only significant information they contain is their
2571# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002572
2573# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = dict()

def _get_test_codec(codec_name):
    # Codec search function backed by the local registry; a name that is
    # not registered there yields None.
    return _TEST_CODECS.get(codec_name)

# register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # No such hook to import: substitute a do-nothing stand-in
    def _forget_codec(codec_name):
        return None
2587
2588
Nick Coghlan8b097b42013-11-13 23:49:21 +10002589class ExceptionChainingTest(unittest.TestCase):
2590
def setUp(self):
    # There's no way to unregister a codec search function, so each test
    # instance works with a codec name of its own: the test case repr
    # followed by id(self).  The id keeps the name truly unique, which
    # avoids issues with the codec cache when these tests run multiple
    # times (e.g. when hunting for refleaks).
    # The codecs module normalizes codec names (although this doesn't
    # appear to be formally documented), so the name is normalized here
    # too.
    raw_name = "{!r}{}".format(self, id(self))
    self.codec_name = encodings.normalize_encoding(raw_name).lower()

    # The object to raise is stored on the instance because of a bad
    # interaction between the codec caching (the codec entry can't be
    # recreated) and regrtest refleak hunting (the same test instance
    # runs multiple times): the codecs have to call back in to the
    # instance to find out what to raise, rather than binding it in a
    # closure to an object that may change on the next run.
    self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002611
def tearDown(self):
    name = self.codec_name
    # Drop this test's entry from the local registry, if it added one
    _TEST_CODECS.pop(name, None)
    # Issue #22166: Also pop from caches to avoid appearance of ref leaks
    encodings._cache.pop(name, None)
    # A KeyError from _forget_codec is ignored
    with contextlib.suppress(KeyError):
        _forget_codec(name)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002620
def set_codec(self, encode, decode):
    # Publish a CodecInfo built from the two callables under this test's
    # codec name in the local registry.
    name = self.codec_name
    _TEST_CODECS[name] = codecs.CodecInfo(encode, decode, name=name)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002625
@contextlib.contextmanager
def assertWrapped(self, operation, exc_type, msg):
    # Context manager: the body must raise exc_type with a message matching
    # the pattern built below, chained (via __cause__) to another exc_type
    # instance that carries a traceback.
    template = r"{} with {!r} codec failed \({}: {}\)"
    expected = template.format(operation, self.codec_name,
                               exc_type.__name__, msg)
    with self.assertRaisesRegex(exc_type, expected) as caught:
        yield caught
    cause = caught.exception.__cause__
    self.assertIsInstance(cause, exc_type)
    self.assertIsNotNone(cause.__traceback__)
Nick Coghlan77b286b2014-01-27 00:53:38 +10002633 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002634
def raise_obj(self, *args, **kwds):
    # Helper to dynamically change the object raised by a test codec:
    # whatever is stored in self.obj_to_raise at call time gets raised,
    # and the call arguments are ignored.
    pending = self.obj_to_raise
    raise pending
Nick Coghlan8b097b42013-11-13 23:49:21 +10002638
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
    # Install a codec whose encode and decode both raise obj_to_raise,
    # then check that all four entry points report it wrapped.
    self.obj_to_raise = obj_to_raise
    self.set_codec(self.raise_obj, self.raise_obj)
    name = self.codec_name
    wrapped = self.assertWrapped
    with wrapped("encoding", exc_type, msg):
        "str_input".encode(name)
    with wrapped("encoding", exc_type, msg):
        codecs.encode("str_input", name)
    with wrapped("decoding", exc_type, msg):
        b"bytes input".decode(name)
    with wrapped("decoding", exc_type, msg):
        codecs.decode(b"bytes input", name)
2650
2651 def test_raise_by_type(self):
2652 self.check_wrapped(RuntimeError, "")
2653
2654 def test_raise_by_value(self):
2655 msg = "This should be wrapped"
2656 self.check_wrapped(RuntimeError(msg), msg)
2657
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002658 def test_raise_grandchild_subclass_exact_size(self):
2659 msg = "This should be wrapped"
2660 class MyRuntimeError(RuntimeError):
2661 __slots__ = ()
2662 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2663
2664 def test_raise_subclass_with_weakref_support(self):
2665 msg = "This should be wrapped"
2666 class MyRuntimeError(RuntimeError):
2667 pass
2668 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2669
def check_not_wrapped(self, obj_to_raise, msg):
    """Check that *obj_to_raise* surfaces as a RuntimeError matching *msg*.

    A single function that raises *obj_to_raise* is installed as both the
    encoder and the decoder through ``self.set_codec()``.  Each of the four
    entry points (the ``str`` and ``bytes`` methods plus the ``codecs``
    module functions) is then called inside
    ``self.assertRaisesRegex(RuntimeError, msg)``.
    """
    def always_raise(*args, **kwds):
        raise obj_to_raise

    self.set_codec(always_raise, always_raise)
    codec = self.codec_name
    attempts = (
        lambda: "str input".encode(codec),
        lambda: codecs.encode("str input", codec),
        lambda: b"bytes input".decode(codec),
        lambda: codecs.decode(b"bytes input", codec),
    )
    for attempt in attempts:
        with self.assertRaisesRegex(RuntimeError, msg):
            attempt()
2682
2683 def test_init_override_is_not_wrapped(self):
2684 class CustomInit(RuntimeError):
2685 def __init__(self):
2686 pass
2687 self.check_not_wrapped(CustomInit, "")
2688
2689 def test_new_override_is_not_wrapped(self):
2690 class CustomNew(RuntimeError):
2691 def __new__(cls):
2692 return super().__new__(cls)
2693 self.check_not_wrapped(CustomNew, "")
2694
2695 def test_instance_attribute_is_not_wrapped(self):
2696 msg = "This should NOT be wrapped"
2697 exc = RuntimeError(msg)
2698 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002699 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002700
2701 def test_non_str_arg_is_not_wrapped(self):
2702 self.check_not_wrapped(RuntimeError(1), "1")
2703
2704 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002705 msg_re = r"^\('a', 'b', 'c'\)$"
2706 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002707
2708 # http://bugs.python.org/issue19609
2709 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002710 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002711 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002712 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002713 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002714 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002715 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002716 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002717 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002718 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002719 codecs.decode(b"bytes input", self.codec_name)
2720
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002721 def test_unflagged_non_text_codec_handling(self):
2722 # The stdlib non-text codecs are now marked so they're
2723 # pre-emptively skipped by the text model related methods
2724 # However, third party codecs won't be flagged, so we still make
2725 # sure the case where an inappropriate output type is produced is
2726 # handled appropriately
2727 def encode_to_str(*args, **kwds):
2728 return "not bytes!", 0
2729 def decode_to_bytes(*args, **kwds):
2730 return b"not str!", 0
2731 self.set_codec(encode_to_str, decode_to_bytes)
2732 # No input or output type checks on the codecs module functions
2733 encoded = codecs.encode(None, self.codec_name)
2734 self.assertEqual(encoded, "not bytes!")
2735 decoded = codecs.decode(None, self.codec_name)
2736 self.assertEqual(decoded, b"not str!")
2737 # Text model methods should complain
2738 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2739 "use codecs.encode\(\) to encode to arbitrary types$")
2740 msg = fmt.format(self.codec_name)
2741 with self.assertRaisesRegex(TypeError, msg):
2742 "str_input".encode(self.codec_name)
2743 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2744 "use codecs.decode\(\) to decode to arbitrary types$")
2745 msg = fmt.format(self.codec_name)
2746 with self.assertRaisesRegex(TypeError, msg):
2747 b"bytes input".decode(self.codec_name)
2748
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002749
Georg Brandl02524622010-12-02 18:06:51 +00002750
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode().
    # The whole class is skipped unless sys.platform is 'win32'.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page number is rejected with ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is expected to fail with OSError.
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode every case in *tests* with code page *cp*.

        *tests* is an iterable of ``(raw, errors, expected)`` triples.  When
        *expected* is None, decoding must raise UnicodeDecodeError.
        Otherwise the decoded text must equal *expected* and the reported
        consumed length must lie between 0 and ``len(raw)`` inclusive.
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                text, consumed = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode every case in *tests* with code page *cp*.

        *tests* is an iterable of ``(text, errors, expected)`` triples.  When
        *expected* is None, encoding must raise UnicodeEncodeError.
        Otherwise the encoded bytes must equal *expected* and the reported
        consumed length must equal ``len(text)``.
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                data, consumed = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The encoding half only runs when the module-level
        # VISTA_OR_LATER flag is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Every call passes False as the fourth argument (presumably the
        # "final" flag -- confirm against the codecs documentation).  The
        # expected results show that a trailing incomplete cp932 sequence
        # is left unconsumed rather than reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2898
2899
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()