blob: 8b78c240415599330088074bdc108e23042521c9 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only on Windows with a major version of at least 6; always False on
# other platforms.  The short-circuit keeps sys.getwindowsversion() from
# being called where it does not exist.
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)
16
# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T to -1 so that users of these names can detect the
# missing module.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a checker for a stateless encode/decode function.

    *self* must provide ``assertEqual`` (e.g. a ``unittest.TestCase``).
    *coder* is called with a single argument.  The returned
    ``check(input, expect)`` callable asserts that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. the expected result plus the full length
    of the input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
29
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer type: every later
        # operation is a concatenation or a slice of this object.
        self._buffer = buffer

    def write(self, chars):
        # Append *chars* to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to *size* items from the front of the
        # buffer; a negative *size* drains everything that is pending.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
49
class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    The methods use ``assert*`` methods of ``self`` and are therefore meant
    to be mixed into a ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point of the encoded input *s*: decode the first
        # part, transplant the decoder state into a fresh decoder, decode
        # the rest there and require that both parts add up to *u*.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split *u* at every position, move the
        # encoder state to a fresh encoder and require that the two encoded
        # parts add up to *s*.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
82
class ReadTest(MixInCheckStateHandling):
    """Reader/decoder tests shared by the per-encoding test cases.

    Concrete subclasses combine this mixin with ``unittest.TestCase`` and
    must define ``encoding``; test_lone_surrogates() additionally reads
    ``ill_formed_sequence`` from the subclass.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for c, partialresult in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for c, partialresult in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for c, partialresult in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so that the line boundaries become visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # The strings below are only test data: they are encoded, read back
        # line by line through a StreamReader and compared with the
        # originals.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes visible to the
        # reader piece by piece, exactly as it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text that test_lone_surrogates() expects in place of the ill-formed
    # sequence when decoding with the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever encoding the empty string produces is treated as a prefix
        # that is stripped from the individually encoded pieces and put back
        # once at the front of the test sequence.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
379
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order determined by a BOM)."""

    encoding = "utf-32"
    # U+DC80 as a single code unit in the byte order of this machine.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
474
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # U+DC80 as a single little-endian code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
519
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # U+DC80 as a single big-endian code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
564
565
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (byte order determined by a BOM)."""

    encoding = "utf-16"
    # U+DC80 as a single code unit in the byte order of this machine.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little and big endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the codecs.open() call is wrapped in the DeprecationWarning
        # check; reading happens outside of it.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000651
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # U+DC80 as a single little-endian code unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected partial result per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Pairs of (invalid input, expected result with errors='replace');
        # every input must also be rejected in strict mode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
695
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec (``utf-16-be``).

    Inherits the shared checks of ``ReadTest`` (defined elsewhere in this
    file) and adds partial-input, malformed-input and non-BMP cases.
    """

    encoding = "utf-16-be"
    # Big-endian bytes of the single code unit 0xDC80, i.e. a low
    # surrogate with no preceding high surrogate.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # The input encodes to 12 bytes (four 2-byte characters plus one
        # 4-byte surrogate pair) and the list below has 12 entries.
        # NOTE(review): check_partial is defined outside this block;
        # presumably it feeds the encoded input one byte at a time and
        # compares the text decoded so far with the matching entry.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with the text expected from the
        # 'replace' error handler.  The same input must be rejected in
        # strict mode when True is passed as the final-data flag.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest (as already used by UTF7Test.test_errors) names the
            # failing input and lets the remaining vectors still run.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 is represented by the surrogate pair 0xD800 0xDE03, each
        # code unit stored high byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
739
class UTF8Test(ReadTest, unittest.TestCase):
    """UTF-8 codec tests layered on the shared ``ReadTest`` checks."""

    encoding = "utf-8"
    # Three-byte UTF-8-style encoding of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # (characters decoded so far, number of consecutive entries).
        # The counts add up to 15, the length of the UTF-8 encoded input.
        schedule = [(1, 2), (2, 2), (3, 3), (4, 3), (5, 4), (6, 1)]
        expected = [
            text[:decoded]
            for decoded, repeats in schedule
            for _ in range(repeats)
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # Surrogates must round-trip through the "surrogatepass" handler,
        # both alone and next to a non-BMP character.
        handler = "surrogatepass"

        self.assertEqual("abc\ud800def".encode("utf-8", handler),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", handler),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", handler),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", handler),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error(handler))

        # Incomplete surrogate sequences are still decoding errors.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", handler)
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", handler)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000793
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows code page 65001 codec (``cp65001``)."""

    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); an expected value
        # of None means the call has to raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); an expected value
        # of None means the call has to raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects a lone surrogate in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # Every other handler yields its own substitution.
        text = "[\uDC80]"
        self.assertEqual(text.encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual(text.encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual(text.encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual(text.encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual(text.encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        handler = "surrogatepass"
        self.assertEqual("abc\ud800def".encode("cp65001", handler),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", handler),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", handler),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", handler),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error(handler))

    def test_readline(self):
        # Replaces the inherited test with an unconditional skip.
        self.skipTest("issue #20571: code page 65001 codec does not "
                      "support partial decoder yet")
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200896
897
class UTF7Test(ReadTest, unittest.TestCase):
    """UTF-7 codec tests layered on the shared ``ReadTest`` checks."""

    encoding = "utf-7"

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # (characters decoded so far, number of consecutive entries).
        # The expanded list has 32 entries in total.
        schedule = [
            (1, 2), (2, 1), (3, 1), (4, 5), (5, 1), (6, 5),
            (7, 1), (8, 5), (9, 1), (10, 8), (11, 1), (12, 1),
        ]
        expected = [
            text[:decoded]
            for decoded, repeats in schedule
            for _ in range(repeats)
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # Malformed input paired with the text the 'replace' handler must
        # produce; strict decoding of the same input must raise.
        cases = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to
        # the same bytes, which decode back to the single character.
        wire = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(self.encoding), wire)
        self.assertEqual('\ud801\udca0'.encode(self.encoding), wire)
        self.assertEqual(wire.decode(self.encoding), '\U000104A0')

    # Rebinding the name to None hides the test_lone_surrogates attribute
    # inherited from the base classes.
    test_lone_surrogates = None
971
972
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level ``codecs.utf_16_ex_decode`` function."""

    def test_errors(self):
        # A single byte is rejected in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without arguments is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
980
class ReadBufferTest(unittest.TestCase):
    """Tests for ``codecs.readbuffer_encode``."""

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        encoded = codecs.readbuffer_encode(source)
        self.assertEqual(encoded, (b"spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument raise TypeError.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
996
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for ``utf-8-sig`` (UTF-8 with a leading BOM signature).

    Everything inherited from ``UTF8Test`` runs with ``encoding`` set to
    ``"utf-8-sig"``; the methods below add BOM-specific cases.
    """

    encoding = "utf-8-sig"

    def test_partial(self):
        # The text itself starts with U+FEFF, so the encoded form carries
        # the codec's signature followed by a second, "real" BOM.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the signature written by the
        # encoder and return the original text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader with a range
        # of read sizes (None means one unbounded read() per iteration)
        # and check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One sub-test per size, so a failure names the size and the
            # remaining sizes are still exercised.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM: the BOM must not show up
        # in the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1080
class EscapeDecodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_decode``."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Every byte except the backslash is copied through unchanged.
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(codecs.escape_decode(data), (data, 2))

    def test_escape(self):
        def check(data, expected):
            # The decoder must return the expected bytes and report the
            # whole input as consumed.
            self.assertEqual(codecs.escape_decode(data),
                             (expected, len(data)))

        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for data, expected in cases:
            check(data, expected)

        # A backslash followed by any byte outside this set is expected to
        # come back unchanged.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for value in range(256):
            if value in recognised:
                continue
            escaped = b'\\' + bytes([value])
            check(escaped, escaped)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: an error by default, dropped by "ignore",
        # turned into "?" by "replace".
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            doubled = bracketed + truncated
            with self.assertRaises(ValueError):
                decode(truncated)
            with self.assertRaises(ValueError):
                decode(bracketed)
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001132
class RecodingTest(unittest.TestCase):
    """Regression test for closing a ``codecs.EncodedFile`` recoder."""

    def test_recoding(self):
        f = io.BytesIO()
        # Use the recoder as a context manager so the underlying stream
        # is closed even if write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        self.assertTrue(f.closed)
1143
# From RFC 3492
# Each entry is a 2-tuple: (text as str, expected Punycode as ASCII bytes).
# Some expected values contain upper-case letters; PunycodeTest.test_encode
# lower-cases both sides before comparing them.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example because a comma between two string literals was forgotten and
# the literals were silently concatenated).  Nothing is raised here.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001251
class PunycodeTest(unittest.TestCase):
    """Run the ``punycode`` codec against ``punycode_testcases``."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            # Decode the reference bytes directly ...
            self.assertEqual(uni, puny.decode("punycode"))
            # ... and again after an ASCII decode/encode round trip.
            roundtripped = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, roundtripped.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001270
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated ``unicode_internal`` codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"

        def native(raw):
            # The tables below are written big-endian; flip the bytes on
            # little-endian machines.
            return bytes(reversed(raw)) if little_endian else raw

        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in valid:
            internal = native(raw)
            with support.check_warnings():
                self.assertEqual(text, internal.decode("unicode_internal"))
        for raw in rejected:
            internal = native(raw)
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                with self.assertRaises(UnicodeDecodeError):
                    internal.decode("unicode_internal")

        # 0x110000 is the first value past the last code point.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                payload.decode("unicode_internal")
        # The error must describe the second 4-byte unit of the input.
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(payload, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register ignore_errors under a private name, then decode "ab"
        # with four 0x22 bytes inserted between the two characters.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            corrupted = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decoder(bytes(corrupted, "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001346
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# The entry at list index i is reported by NameprepTest as "Test 3.<i+1>",
# matching the "3.N" labels in the comments below.
#   * expected is None -> nameprep() is expected to raise UnicodeError
#   * (None, None)     -> the vector is skipped; the placeholder keeps the
#                         numbering of the following entries aligned
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1499
1500
class NameprepTest(unittest.TestCase):
    """Run the nameprep test vectors against encodings.idna.nameprep()."""

    def test_nameprep(self):
        """Check every vector listed in ``nameprep_tests``.

        Each vector is an ``(orig, prepped)`` pair of UTF-8 byte strings.
        ``orig is None`` marks a skipped vector.  ``prepped is None`` means
        nameprep() must raise UnicodeError for the input.

        Every vector runs in its own subTest labelled "3.N", so a failing
        vector keeps its original traceback and does not stop the
        remaining vectors from being checked.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % (pos + 1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001519
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        """bytes -> str through str(..., "idna"), with and without root dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str -> bytes through str.encode("idna")."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """A second read() on an exhausted idna StreamReader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode byte-by-byte with iterdecode() and an incremental decoder."""
        # ASCII and non-ASCII names, each with and without the trailing dot.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                chunks = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(chunks, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that terminates it was seen;
        # the last label comes out on the final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode with iterencode() and with an incremental encoder."""
        # ASCII and non-ASCII names, each with and without the trailing dot.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            with self.subTest(errors=errors):
                # The codec rejects every non-strict handler with
                # UnicodeError; checking for that type keeps unrelated
                # failures (e.g. a TypeError) from passing silently.
                self.assertRaises(UnicodeError,
                                  "python.org".encode, "idna", errors)
                self.assertRaises(UnicodeError,
                                  b"python.org".decode, "idna", errors)
1605
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookups for the module-level codecs functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NB: '\xffff' is U+00FF followed by the two letters "ff"
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bad_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bad_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        """codecs.__all__ lists exactly the expected names, all resolvable."""
        expected_api = (
            # one-shot helpers and registry
            "encode", "decode", "register", "lookup",
            # base classes
            "CodecInfo", "Codec",
            "IncrementalEncoder", "IncrementalDecoder",
            "StreamReader", "StreamWriter",
            "StreamReaderWriter", "StreamRecoder",
            # factory lookups
            "getencoder", "getdecoder",
            "getincrementalencoder", "getincrementaldecoder",
            "getreader", "getwriter",
            # error handlers
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            # file and iterator helpers
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            # byte order marks
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
        )
        self.assertCountEqual(expected_api, codecs.__all__)
        for exported in codecs.__all__:
            getattr(codecs, exported)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as file:
                    self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        """The "undefined" codec fails for any input and any error handler."""
        for text, data in (('abc', b'abc'), ('', b'')):
            with self.assertRaises(UnicodeError):
                codecs.encode(text, 'undefined')
            with self.assertRaises(UnicodeError):
                codecs.decode(data, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            with self.assertRaises(UnicodeError):
                codecs.encode('abc', 'undefined', errors)
            with self.assertRaises(UnicodeError):
                codecs.decode(b'abc', 'undefined', errors)
1700
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two multi-byte characters separated by a newline, UTF-8 encoded.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001710
class EncodedFileTest(unittest.TestCase):
    """Recoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 bytes in the file come back as UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        recoded = recoder.read()
        self.assertEqual(recoded, b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data handed to write() lands in the file as Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001722
# Names of the text encodings exercised by BasicUnicodeTest, in sorted order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8",
]

# "mbcs" is only tested where the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001838
Walter Dörwald3abcb012007-04-16 22:10:50 +00001839class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
    """Round-trip a short ASCII string through every listed encoding.

    For each name in all_unicode_encodings this checks, in order:
    the canonical name reported by codecs.lookup(); the stateless
    encoder/decoder pair; and, unless the encoding is listed in
    broken_unicode_with_stateful, the stream writer/reader and the
    incremental encoder/decoder (fed one character / one byte at a time)
    plus iterencode()/iterdecode().
    """
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        # The looked-up name must match the list entry once "_" and "-"
        # are treated as the same character.
        name = codecs.lookup(encoding).name
        if encoding.endswith("_codec"):
            name += "_codec"
        elif encoding == "latin_1":
            name = "latin_1"
        self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

        with support.check_warnings():
            # unicode-internal has been deprecated
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

        if encoding not in broken_unicode_with_stateful:
            # check stream reader/writer
            q = Queue(b"")
            writer = codecs.getwriter(encoding)(q)
            encodedresult = b""
            for c in s:
                # write one character, then drain what the writer produced
                writer.write(c)
                chunk = q.read()
                self.assertTrue(type(chunk) is bytes, type(chunk))
                encodedresult += chunk
            q = Queue(b"")
            reader = codecs.getreader(encoding)(q)
            decodedresult = ""
            for c in encodedresult:
                # feed the reader a single byte at a time
                q.write(bytes([c]))
                decodedresult += reader.read()
            self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = b""
                for c in s:
                    encodedresult += encoder.encode(c)
                # final call flushes whatever the encoder still buffers
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += decoder.decode(bytes([c]))
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

                # check iterencode()/iterdecode()
                result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, "encoding=%r" % encoding)

                # check iterencode()/iterdecode() with empty string
                result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                self.assertEqual(result, "")

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = "".join(decoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001918
@support.cpython_only
def test_basics_capi(self):
    """Round-trip "abc123" through incremental coders from _testcapi.

    Same incremental checks as test_basics, but the encoder and decoder
    objects are obtained from _testcapi.codec_incrementalencoder() and
    _testcapi.codec_incrementaldecoder().  Encodings listed in
    broken_unicode_with_stateful are skipped.
    """
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                # final call flushes whatever the encoder still buffers
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    # decode a single byte at a time
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001957
def test_seek(self):
    """seek(0, 0) on a StreamReader must allow re-reading the whole text."""
    # all codecs should be able to encode these
    text = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # FIXME: idna is skipped, see SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        raw = io.BytesIO(text.encode(encoding))
        reader = codecs.getreader(encoding)(raw)
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            self.assertEqual(text, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00001972
def test_bad_decode_args(self):
    # Each decoder function must raise TypeError when called without any
    # argument and, except for "idna" and "punycode", when given the int 42.
    for encoding in all_unicode_encodings:
        decode = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decode()
        if encoding in ("idna", "punycode"):
            continue
        with self.assertRaises(TypeError):
            decode(42)
1979
def test_bad_encode_args(self):
    # Each encoder function must raise TypeError when called without any
    # argument.
    for encoding in all_unicode_encodings:
        encode = codecs.getencoder(encoding)
        # unicode-internal has been deprecated, hence the warnings guard
        with support.check_warnings(), self.assertRaises(TypeError):
            encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00001986
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001987 def test_encoding_map_type_initialized(self):
1988 from encodings import cp1140
1989 # This used to crash, we are only verifying there's no crash.
1990 table_type = type(cp1140.encoding_table)
1991 self.assertEqual(table_type, table_type)
1992
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    text = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        # Decode check first, then encode check, each with a fresh encoding
        # of the sample text.
        for check in (self.check_state_handling_decode,
                      self.check_state_handling_encode):
            check(encoding, text, text.encode(encoding))
2000
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with the three kinds of mapping the
    # tests below pass in: a str, a dict of int -> str and a dict of
    # int -> int.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        self.assertEqual(decode(raw, "strict", "abc"), ("abc", 3))
        self.assertEqual(decode(raw, "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # Byte 2 has no entry in "ab" and maps to U+FFFE in "ab\ufffe";
        # both cases are expected to be treated as undecodable.
        unmapped = ("ab", "ab\ufffe")
        for mapping in unmapped:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)
        for mapping in unmapped:
            self.assertEqual(decode(raw, "replace", mapping),
                             ("ab\ufffd", 3))
        for mapping in unmapped:
            self.assertEqual(decode(raw, "ignore", mapping), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        self.assertEqual(decode(raw, "strict", {0: 'a', 1: 'b', 2: 'c'}),
                         ("abc", 3))
        self.assertEqual(decode(raw, "strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
                         ("AaBbCc", 3))
        self.assertEqual(
            decode(raw, "strict", {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3))
        self.assertEqual(decode(raw, "strict", {0: 'a', 1: 'b', 2: ''}),
                         ("ab", 3))

        # Three ways of leaving byte 2 without a usable character: no key,
        # a None value, and (Issue #14850) a '\ufffe' value.
        unmapped = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in unmapped:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)
        for mapping in unmapped:
            self.assertEqual(decode(raw, "replace", mapping),
                             ("ab\ufffd", 3))
        for mapping in unmapped:
            self.assertEqual(decode(raw, "ignore", mapping), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"
        a, b, c = map(ord, "abc")

        self.assertEqual(decode(raw, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(raw, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(raw, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # One past the largest code point is rejected with TypeError.
        with self.assertRaises(TypeError):
            decode(raw, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 either has no key or maps to 0xFFFE.
        unmapped = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in unmapped:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)
        for mapping in unmapped:
            self.assertEqual(decode(raw, "replace", mapping),
                             ("ab\ufffd", 3))
        for mapping in unmapped:
            self.assertEqual(decode(raw, "ignore", mapping), ("ab", 3))
2194
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002195
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers and close
    # the stream they wrap when the with-block is left.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        # The wrapper recodes the UTF-8 file content to Latin-1 on read.
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as for EncodedFile above: leaving the block closes
        # the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002209
class TypesTest(unittest.TestCase):
    # Checks which argument types the low-level codec functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only added when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is one past the last code point: an error in strict
        # mode, a single U+FFFD (consuming all 10 characters) with "replace".
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
2245
Serhiy Storchakad6793772013-01-29 10:20:44 +02002246
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash encode to themselves.
        backslash = ord("\\")
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash, followed by b'0', decodes to
        # the code point with the same value followed by '0'.
        backslash = ord("\\")
        for code in range(256):
            if code == backslash:
                continue
            result = codecs.unicode_escape_decode(bytes([code]) + b'0')
            self.assertEqual(result, (chr(code) + '0', 2))

    def test_escape_encode(self):
        def check(text, expected):
            # The codec must return the expected bytes and report the whole
            # input as consumed.
            self.assertEqual(codecs.unicode_escape_encode(text),
                             (expected, len(text)))

        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for text, expected in named:
            check(text, expected)
        # Remaining control characters, then 127..255, use \xNN escapes.
        for code in list(range(32)) + list(range(127, 256)):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        def check(raw, expected):
            # The codec must return the expected text and report the whole
            # input as consumed.
            self.assertEqual(codecs.unicode_escape_decode(raw),
                             (expected, len(raw)))

        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, expected in cases:
            check(raw, expected)
        # A backslash before any byte outside this set is kept verbatim.
        escape_bytes = b'\n"\'\\abtnvfr01234567xuUN'
        for code in range(256):
            if code in escape_bytes:
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for letter, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape followed by too few "0" digits.
                truncated = b"\\" + letter + b"0" * count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        with self.assertRaises(UnicodeDecodeError):
            decode(too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2323
2324
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() /
    # raw_unicode_escape_decode().

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the byte of the same value.
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte, followed by b'0', decodes to the code point with the
        # same value followed by '0'.
        for code in range(256):
            result = codecs.raw_unicode_escape_decode(bytes([code]) + b'0')
            self.assertEqual(result, (chr(code) + '0', 2))

    def test_escape_encode(self):
        def check(text, expected):
            # The codec must return the expected bytes and report the whole
            # input as consumed.
            self.assertEqual(codecs.raw_unicode_escape_encode(text),
                             (expected, len(text)))

        # A backslash followed by anything but 'u' or 'U' is left alone.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        def check(raw, expected):
            # The codec must return the expected text and report the whole
            # input as consumed.
            self.assertEqual(codecs.raw_unicode_escape_decode(raw),
                             (expected, len(raw)))

        # A backslash followed by anything but 'u' or 'U' is left alone.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for letter, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape followed by too few "0" digits.
                truncated = b"\\" + letter + b"0" * count
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        with self.assertRaises(UnicodeDecodeError):
            decode(too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2373
2374
class SurrogateEscapeTest(unittest.TestCase):
    # Tests for the "surrogateescape" error handler: each pair below lists
    # raw bytes and the text they must decode to; encoding the text must
    # give the same bytes back.

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2407
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002408
class BomTest(unittest.TestCase):
    # Checks when a BOM is (re)written by files opened through codecs.open().

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        bom_encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        path = support.TESTFN
        self.addCleanup(support.unlink, path)
        for encoding in bom_encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002464
Victor Stinner3fed0872010-05-22 02:16:27 +00002465
# Names of the bytes-to-bytes transform codecs that are always exercised;
# the optional compression codecs are appended below when importable.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alias names expected to resolve to each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so later code can test it for truthiness.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    transform_aliases["bz2_codec"] = ["bz2"]
    bytes_transform_encodings.append("bz2_codec")
Georg Brandl02524622010-12-02 18:06:51 +00002495
class TransformCodecTest(unittest.TestCase):
    # Tests for the bytes-to-bytes (and rot_13) transform codecs listed in
    # bytes_transform_encodings / transform_aliases.

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Raw strings: the pattern contains regex escapes (\( and \))
                # that are not valid string-literal escapes.
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapping exception must chain to a cause of the same type.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The wrapping exception must chain to a cause of the same type.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2627
Nick Coghlan8b097b42013-11-13 23:49:21 +10002628
2629# The codec system tries to wrap exceptions in order to ensure the error
2630# mentions the operation being performed and the codec involved. We
2631# currently *only* want this to happen for relatively stateless
2632# exceptions, where the only significant information they contain is their
2633# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002634
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Maps a codec name to the codec entry registered for it by a test.
_TEST_CODECS = dict()


def _get_test_codec(codec_name):
    # Codec search function: answer from the local registry, or None when
    # the name has not been registered there.
    if codec_name in _TEST_CODECS:
        return _TEST_CODECS[codec_name]
    return None


# Returns None, not usable as a decorator
codecs.register(_get_test_codec)

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Fallback used when _codecs does not provide the helper: nothing
        # to do.
        return None
2649
2650
Nick Coghlan8b097b42013-11-13 23:49:21 +10002651class ExceptionChainingTest(unittest.TestCase):
2652
def setUp(self):
    # A codec search function cannot be unregistered, so the codec used by
    # each test gets a name derived from the test case itself, which leaves
    # it fairly harmless once the test is over.  The repr alone is not
    # enough: the id is appended so the name is truly unique and the codec
    # cache does not get in the way when the tests are run several times
    # (e.g. when hunting for refleaks).
    # The codecs module normalizes codec names (this does not appear to be
    # formally documented), so the same normalization is applied here.
    raw_name = "{!r}{}".format(self, id(self))
    self.codec_name = encodings.normalize_encoding(raw_name).lower()

    # The object to raise lives on the instance because codec caching
    # (the codec entry cannot be recreated) interacts badly with regrtest
    # refleak hunting (the same test instance is run multiple times).  The
    # codecs must therefore call back in to the instance to find out what
    # to raise, rather than binding it in a closure to an object that may
    # change on the next run.
    self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002673
Nick Coghlan4e553e22013-11-16 00:35:34 +10002674 def tearDown(self):
2675 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8fad1672014-09-15 23:50:44 +12002676 # Issue #22166: Also pop from caches to avoid appearance of ref leaks
2677 encodings._cache.pop(self.codec_name, None)
2678 try:
2679 _forget_codec(self.codec_name)
2680 except KeyError:
2681 pass
Nick Coghlan8b097b42013-11-13 23:49:21 +10002682
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002683 def set_codec(self, encode, decode):
2684 codec_info = codecs.CodecInfo(encode, decode,
Nick Coghlan4e553e22013-11-16 00:35:34 +10002685 name=self.codec_name)
2686 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002687
2688 @contextlib.contextmanager
2689 def assertWrapped(self, operation, exc_type, msg):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002690 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002691 operation, self.codec_name, exc_type.__name__, msg)
2692 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2693 yield caught
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002694 self.assertIsInstance(caught.exception.__cause__, exc_type)
Nick Coghlan77b286b2014-01-27 00:53:38 +10002695 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002696
2697 def raise_obj(self, *args, **kwds):
2698 # Helper to dynamically change the object raised by a test codec
2699 raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002700
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002701 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002702 self.obj_to_raise = obj_to_raise
2703 self.set_codec(self.raise_obj, self.raise_obj)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002704 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002705 "str_input".encode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002706 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002707 codecs.encode("str_input", self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002708 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002709 b"bytes input".decode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002710 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002711 codecs.decode(b"bytes input", self.codec_name)
2712
2713 def test_raise_by_type(self):
2714 self.check_wrapped(RuntimeError, "")
2715
2716 def test_raise_by_value(self):
2717 msg = "This should be wrapped"
2718 self.check_wrapped(RuntimeError(msg), msg)
2719
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002720 def test_raise_grandchild_subclass_exact_size(self):
2721 msg = "This should be wrapped"
2722 class MyRuntimeError(RuntimeError):
2723 __slots__ = ()
2724 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2725
2726 def test_raise_subclass_with_weakref_support(self):
2727 msg = "This should be wrapped"
2728 class MyRuntimeError(RuntimeError):
2729 pass
2730 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2731
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002732 def check_not_wrapped(self, obj_to_raise, msg):
2733 def raise_obj(*args, **kwds):
2734 raise obj_to_raise
2735 self.set_codec(raise_obj, raise_obj)
2736 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002737 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002738 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002739 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002740 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002741 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002742 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002743 codecs.decode(b"bytes input", self.codec_name)
2744
2745 def test_init_override_is_not_wrapped(self):
2746 class CustomInit(RuntimeError):
2747 def __init__(self):
2748 pass
2749 self.check_not_wrapped(CustomInit, "")
2750
2751 def test_new_override_is_not_wrapped(self):
2752 class CustomNew(RuntimeError):
2753 def __new__(cls):
2754 return super().__new__(cls)
2755 self.check_not_wrapped(CustomNew, "")
2756
2757 def test_instance_attribute_is_not_wrapped(self):
2758 msg = "This should NOT be wrapped"
2759 exc = RuntimeError(msg)
2760 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002761 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002762
2763 def test_non_str_arg_is_not_wrapped(self):
2764 self.check_not_wrapped(RuntimeError(1), "1")
2765
2766 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002767 msg_re = r"^\('a', 'b', 'c'\)$"
2768 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002769
2770 # http://bugs.python.org/issue19609
2771 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002772 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002773 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002774 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002775 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002776 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002777 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002778 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002779 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002780 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002781 codecs.decode(b"bytes input", self.codec_name)
2782
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002783 def test_unflagged_non_text_codec_handling(self):
2784 # The stdlib non-text codecs are now marked so they're
2785 # pre-emptively skipped by the text model related methods
2786 # However, third party codecs won't be flagged, so we still make
2787 # sure the case where an inappropriate output type is produced is
2788 # handled appropriately
2789 def encode_to_str(*args, **kwds):
2790 return "not bytes!", 0
2791 def decode_to_bytes(*args, **kwds):
2792 return b"not str!", 0
2793 self.set_codec(encode_to_str, decode_to_bytes)
2794 # No input or output type checks on the codecs module functions
2795 encoded = codecs.encode(None, self.codec_name)
2796 self.assertEqual(encoded, "not bytes!")
2797 decoded = codecs.decode(None, self.codec_name)
2798 self.assertEqual(decoded, b"not str!")
2799 # Text model methods should complain
2800 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2801 "use codecs.encode\(\) to encode to arbitrary types$")
2802 msg = fmt.format(self.codec_name)
2803 with self.assertRaisesRegex(TypeError, msg):
2804 "str_input".encode(self.codec_name)
2805 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2806 "use codecs.decode\(\) to decode to arbitrary types$")
2807 msg = fmt.format(self.codec_name)
2808 with self.assertRaisesRegex(TypeError, msg):
2809 b"bytes input".decode(self.codec_name)
2810
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002811
Georg Brandl02524622010-12-02 18:06:51 +00002812
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode every (raw, errors, expected) case with code page *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError; otherwise the decoded text must equal
        *expected* and the consumed count must lie in [0, len(raw)].
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode every (text, errors, expected) case with code page *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the consumed count must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = encoded[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # final=False: a trailing incomplete sequence is left unconsumed
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2960
2961
if __name__ == "__main__":
    # Run the test cases of this module when executed as a script
    unittest.main()