blob: a1079a1f1a01311d23056cac3b70ab7cdc3c098e [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only on Windows when the reported major version is at least 6.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional; when it cannot be imported the module name is bound
# to None and the wchar_t size is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and codec *coder*.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``, i.e. that the whole input was consumed
    and produced exactly *expect*.
    """
    def check(input, expect):
        outcome = coder(input)
        self.assertEqual(outcome, (expect, len(input)))
    return check
29
class Queue:
    """
    FIFO buffer: data written at one end is read back from the other end
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        pending = self._buffer
        if size < 0:
            # hand out everything and keep an empty buffer of the same type
            self._buffer = pending[:0]
            return pending
        self._buffer = pending[size:]
        return pending[:size]
49
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() round trips of incremental
    codecs.

    The helpers use the ``assert*`` methods of the test case class this
    mixin is combined with.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be suspended after any number of bytes.

        For every split point the first part is decoded, the decoder state
        is moved into a fresh decoder and the rest is decoded there; the
        concatenated output has to equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input must not produce any output
                # (assertFalse reports the unexpected output on failure)
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be suspended after any number of
        characters and resumed in a fresh encoder, yielding *s* in total.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # continue in a brand-new encoder that only receives the state
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
82
class ReadTest(MixInCheckStateHandling):
    """Encoding-independent reader/decoder tests.

    Concrete test cases have to provide an ``encoding`` attribute;
    test_lone_surrogates() additionally reads ``ill_formed_sequence``.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* byte by byte and compare the
        text decoded so far with the matching entry of *partialresults*.

        The comparison is done with a StreamReader, with an incremental
        decoder, with the same decoder after reset(), and finally the whole
        input is round-tripped through codecs.iterdecode().
        """
        # The byte string is identical for every pass: encode it only once.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder; the
        # second pass checks whether the reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for reset_first in (False, True):
            if reset_first:
                d.reset()
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        single_bytes = [bytes([c]) for c in encoded]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        count = 0
        for (count, line) in enumerate(reader, 1):
            self.assertEqual(line, s[count - 1])
        # Without this check a reader that stops early (or yields nothing
        # at all) would let the loop above pass vacuously.
        self.assertEqual(count, len(s))

    def test_readlinequeue(self):
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever encoding the empty string produces is emitted in front of
        # every encoded string; strip it from the parts so that the test
        # sequence carries it exactly once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
379
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM related tests for the "utf-32" codec."""
    encoding = "utf-32"
    # The lone surrogate U+DC80 as four raw bytes, laid out according to
    # sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the bytes actually written when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # a truncated unit is consumed by both lenient error handlers
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
474
class UTF32LETest(ReadTest, unittest.TestCase):
    """Runs the shared ReadTest checks against the "utf-32-le" codec."""
    encoding = "utf-32-le"
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected value per input byte: a character becomes visible
        # with its fourth byte, so every intermediate prefix is listed four
        # times in a row.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # a one byte input must be rejected in strict mode
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
519
class UTF32BETest(ReadTest, unittest.TestCase):
    """Runs the shared ReadTest checks against the "utf-32-be" codec."""
    encoding = "utf-32-be"
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected value per input byte: a character becomes visible
        # with its fourth byte, so every intermediate prefix is listed four
        # times in a row.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # a one byte input must be rejected in strict mode
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
564
565
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM related tests for the "utf-16" codec."""
    encoding = "utf-16"
    # The lone surrogate U+DC80 as two raw bytes, laid out according to
    # sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, in little and big endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the bytes actually written when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # a truncated unit is consumed by both lenient error handlers
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the open() call is wrapped: that is where the
        # DeprecationWarning for the 'U' mode is expected.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000651
class UTF16LETest(ReadTest, unittest.TestCase):
    """Runs the shared ReadTest checks against the "utf-16-le" codec."""
    encoding = "utf-16-le"
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected value per input byte: BMP characters become visible
        # with their second byte, the final non-BMP character only with its
        # fourth one.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, text expected with the "replace" handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'), replaced)

    def test_nonbmp(self):
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
695
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec; shared cases are inherited from ReadTest."""

    encoding = "utf-16-be"
    # The lone low surrogate U+DC80 in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # The input encodes to 12 bytes and the list has 12 entries, one
        # expected decoded prefix per byte (presumably check_partial feeds
        # the encoded input byte by byte -- the helper lives in ReadTest).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with the
        # 'replace' error handler).  Strict decoding with final=True must
        # reject every one of them.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps going after a failing entry and names the
            # offending input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the surrogate pair D800 DE03,
        # each code unit stored high byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
739
class UTF8Test(ReadTest, unittest.TestCase):
    """Checks for the "utf-8" codec on top of the cases inherited from ReadTest."""

    encoding = "utf-8"
    # Three-byte UTF-8-style encoding of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xed\xb2\x80"
    # One replacement character per byte of the ill-formed sequence.
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # 15 encoded bytes, 15 expected prefixes (presumably one per byte
        # fed to the decoder -- check_partial is defined in ReadTest).
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_prefixes = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_prefixes)

    def test_decoder_state(self):
        # Sample text touching the boundaries of each UTF-8 sequence length.
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000793
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Checks for the Windows-only "cp65001" codec."""

    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); an expected value
        # of None means encoding must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # Lone-surrogate expectations depend on the Windows version.
        if VISTA_OR_LATER:
            tests += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            tests += [('\udc80', 'strict', b'\xed\xb2\x80')]
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            message = ('%a.encode("cp65001", %r)=%a != %a'
                       % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, message)

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); an expected value
        # of None means decoding must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        # Encoded lone surrogates: expectations depend on the Windows version.
        if VISTA_OR_LATER:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ]
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            message = ('%a.decode("cp65001", %r)=%a != %a'
                       % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, message)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # (error handler, expected encoding of "[\uDC80]")
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, encoded in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), encoded)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))

    def test_readline(self):
        # Replaces the test of the same name that would otherwise be
        # inherited; always skipped for this codec.
        reason = ("issue #20571: code page 65001 codec does not "
                  "support partial decoder yet")
        self.skipTest(reason)
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200896
897
class UTF7Test(ReadTest, unittest.TestCase):
    """Checks for the "utf-7" codec on top of the cases inherited from ReadTest."""

    encoding = "utf-7"

    def test_ascii(self):
        codec = self.encoding

        def check_direct(chars):
            # The characters must pass through the codec unchanged in
            # both directions.
            as_ascii = chars.encode('ascii')
            self.assertEqual(chars.encode(codec), as_ascii)
            self.assertEqual(as_ascii.decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        every_ascii = set(bytes(range(0x80)).decode())
        handled = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(every_ascii - handled))
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Expected decoded prefixes, in order (check_partial is defined in
        # ReadTest; presumably one entry per encoded byte).
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_prefixes = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_prefixes)

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        cases = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                # Strict decoding with final=True must reject the input.
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        codec = self.encoding
        same = self.assertEqual
        # Decoding is checked both with and without the closing '-'.
        same('\U000104A0'.encode(codec), b'+2AHcoA-')
        same('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        same(b'+2AHcoA-'.decode(codec), '\U000104A0')
        same(b'+2AHcoA'.decode(codec), '\U000104A0')
        same('\u20ac\U000104A0'.encode(codec), b'+IKzYAdyg-')
        same(b'+IKzYAdyg-'.decode(codec), '\u20ac\U000104A0')
        same(b'+IKzYAdyg'.decode(codec), '\u20ac\U000104A0')
        same('\u20ac\u20ac\U000104A0'.encode(codec), b'+IKwgrNgB3KA-')
        same(b'+IKwgrNgB3KA-'.decode(codec), '\u20ac\u20ac\U000104A0')
        same(b'+IKwgrNgB3KA'.decode(codec), '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # (input, expected result with the 'replace' handler)
        cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, replaced in cases:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, replaced)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001029
1030
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte with final=True is an incomplete code unit.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1038
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is returned as the equivalent bytes
        # object together with its length.
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
1054
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec, reusing the UTF8Test cases."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # The text itself starts with U+FEFF, so its encoding holds two
        # BOM sequences: the signature (skipped) and the real character.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder must be dropped again by
        # the incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means one unbounded read() per call) and check
        # that the concatenated chunks equal unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()
                    # An empty result marks the end of the stream.
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a signature must decode identically.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1138
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        backslash = b'\\'[0]
        # Every byte except the backslash passes through unchanged.
        for code in range(256):
            if code == backslash:
                continue
            raw = bytes([code]) + b'0'
            self.assertEqual(decode(raw), (raw, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        recognised = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, decoded in recognised:
            check(escaped, decoded)
        # A backslash followed by any other byte is left untouched.
        escape_chars = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in escape_chars:
                continue
            sequence = b'\\' + bytes([code])
            check(sequence, sequence)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes; the second field is the reported input
        # length of "[<bad>]<bad>".
        for bad, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + bad + b"]"
            doubled = bracketed + bad
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(doubled, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(doubled, "replace"), (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001190
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001191class RecodingTest(unittest.TestCase):
1192 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001193 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +02001194 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001195 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001196 f2.close()
1197 # Python used to crash on this at exit because of a refcount
1198 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +00001199
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001200 self.assertTrue(f.closed)
1201
# Sample strings from RFC 3492.  Each entry is a pair of
# (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check of the table: report any entry that is not a pair (a
# missing comma would silently merge adjacent string literals).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309
class PunycodeTest(unittest.TestCase):
    """Runs the punycode codec against every entry of punycode_testcases."""

    def test_encode(self):
        for text, reference in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = text.encode("punycode").decode("ascii").lower()
            wanted = reference.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        deprecated = ('unicode_internal codec has been deprecated',
                      DeprecationWarning)
        # Samples are written big-endian and flipped on little-endian hosts.
        flip = sys.byteorder == "little"

        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        for raw, expected in valid:
            native = raw[::-1] if flip else raw
            with support.check_warnings():
                self.assertEqual(expected, native.decode("unicode_internal"))

        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw in rejected:
            native = raw[::-1] if flip else raw
            with support.check_warnings(deprecated):
                self.assertRaises(UnicodeDecodeError, native.decode,
                                  "unicode_internal")

        invalid = b"\x00\x00\x11\x00" if flip else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError must describe the failing slice."""
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(
                    ('unicode_internal codec has been deprecated',
                     DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        error = caught.exception
        self.assertEqual("unicode_internal", error.encoding)
        self.assertEqual(payload, error.object)
        self.assertEqual(4, error.start)
        self.assertEqual(8, error.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is called for the undecodable unit."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        with support.check_warnings(
                ('unicode_internal codec has been deprecated',
                 DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Insert four bogus bytes between the encodings of "a" and "b".
            damaged = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decode(damaged, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input items consumed."""
        with support.check_warnings(
                ('unicode_internal codec has been deprecated',
                 DeprecationWarning)):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            self.assertEqual(encode("a")[1], 1)
            self.assertEqual(encode("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001404
# Nameprep test vectors, taken from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded bytes.  An expected
# value of None means nameprep must reject the input; (None, None) marks a
# vector that is skipped.  Entry N (1-based) is section 3.N of the draft.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1557
1558
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the module-level nameprep_tests."""

    def test_nameprep(self):
        """Check every non-skipped vector, reporting each one separately.

        Each vector runs in its own subTest labelled with its section
        number ("3.N"), so one failing vector neither hides the others nor
        loses its identity, including vectors that expect a rejection.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests, start=1):
            if orig is None:
                # Skipped
                continue
            with self.subTest(vector="3.%d" % pos):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001577
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Feed the decoder one byte at a time, then in uneven chunks."""
        # ASCII and non-ASCII names, each with and without a trailing dot.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                pieces = (bytes([c]) for c in encoded)
                self.assertEqual(
                    "".join(codecs.iterdecode(pieces, "idna")),
                    expected
                )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Feed the encoder one character at a time, then in uneven chunks."""
        # ASCII and non-ASCII names, each with and without a trailing dot.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1663
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001664class CodecsModuleTest(unittest.TestCase):
1665
1666 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001667 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1668 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001669 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001670 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001671 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001672
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001673 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001674 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1675 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001676 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001677 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001678 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001679 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001680
1681 def test_register(self):
1682 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001683 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001684
1685 def test_lookup(self):
1686 self.assertRaises(TypeError, codecs.lookup)
1687 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001688 self.assertRaises(LookupError, codecs.lookup, " ")
1689
1690 def test_getencoder(self):
1691 self.assertRaises(TypeError, codecs.getencoder)
1692 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1693
1694 def test_getdecoder(self):
1695 self.assertRaises(TypeError, codecs.getdecoder)
1696 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1697
1698 def test_getreader(self):
1699 self.assertRaises(TypeError, codecs.getreader)
1700 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1701
1702 def test_getwriter(self):
1703 self.assertRaises(TypeError, codecs.getwriter)
1704 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001705
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001706 def test_lookup_issue1813(self):
1707 # Issue #1813: under Turkish locales, lookup of some codecs failed
1708 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001709 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001710 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1711 try:
1712 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1713 except locale.Error:
1714 # Unsupported locale on this system
1715 self.skipTest('test needs Turkish locale')
1716 c = codecs.lookup('ASCII')
1717 self.assertEqual(c.name, 'ascii')
1718
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001719 def test_all(self):
1720 api = (
1721 "encode", "decode",
1722 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1723 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1724 "getencoder", "getdecoder", "getincrementalencoder",
1725 "getincrementaldecoder", "getreader", "getwriter",
1726 "register_error", "lookup_error",
1727 "strict_errors", "replace_errors", "ignore_errors",
1728 "xmlcharrefreplace_errors", "backslashreplace_errors",
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001729 "open", "EncodedFile",
1730 "iterencode", "iterdecode",
1731 "BOM", "BOM_BE", "BOM_LE",
1732 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1733 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1734 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1735 "StreamReaderWriter", "StreamRecoder",
1736 )
1737 self.assertCountEqual(api, codecs.__all__)
1738 for api in codecs.__all__:
1739 getattr(codecs, api)
1740
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001741 def test_open(self):
1742 self.addCleanup(support.unlink, support.TESTFN)
1743 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1744 with self.subTest(mode), \
1745 codecs.open(support.TESTFN, mode, 'ascii') as file:
1746 self.assertIsInstance(file, codecs.StreamReaderWriter)
1747
1748 def test_undefined(self):
1749 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1750 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1751 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1752 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1753 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1754 self.assertRaises(UnicodeError,
1755 codecs.encode, 'abc', 'undefined', errors)
1756 self.assertRaises(UnicodeError,
1757 codecs.decode, b'abc', 'undefined', errors)
1758
class StreamReaderTest(unittest.TestCase):
    """Tests for codecs.StreamReader using the UTF-8 reader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the newline."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001768
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile transcoding in both directions."""

    def test_basic(self):
        # Reading: UTF-8 in the underlying file comes out as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data is stored as Latin-1 in the underlying file.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001780
# Names of the text encodings exercised by the generic codec tests below,
# in alphabetical order and grouped by family.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]
1886
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001887# The following encoding is not tested, because it's not supposed
1888# to work:
1889# "undefined"
1890
1891# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001896
Walter Dörwald3abcb012007-04-16 22:10:50 +00001897class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001898 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001899 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001900 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001901 name = codecs.lookup(encoding).name
1902 if encoding.endswith("_codec"):
1903 name += "_codec"
1904 elif encoding == "latin_1":
1905 name = "latin_1"
1906 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001907
Ezio Melottiadc417c2011-11-17 12:23:34 +02001908 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001909 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001910 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001911 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001912 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001913 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001914
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001915 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001916 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001917 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001918 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001919 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001920 for c in s:
1921 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001922 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001923 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001924 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001925 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001926 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001927 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001928 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001929 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001930 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001931 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001932
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001933 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001934 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001935 try:
1936 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001937 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001938 pass
1939 else:
1940 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001941 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001942 for c in s:
1943 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001944 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001945 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001946 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001947 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001948 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001949 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001950 self.assertEqual(decodedresult, s,
1951 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001952
1953 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001954 result = "".join(codecs.iterdecode(
1955 codecs.iterencode(s, encoding), encoding))
1956 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001957
1958 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001959 result = "".join(codecs.iterdecode(
1960 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001961 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001962
Victor Stinner554f3f02010-06-16 23:33:54 +00001963 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001964 # check incremental decoder/encoder with errors argument
1965 try:
1966 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001967 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001968 pass
1969 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001970 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001971 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001972 decodedresult = "".join(decoder.decode(bytes([c]))
1973 for c in encodedresult)
1974 self.assertEqual(decodedresult, s,
1975 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001976
@support.cpython_only
def test_basics_capi(self):
    # Round-trip a short ASCII string through the incremental encoder and
    # decoder objects handed out by the _testcapi helpers, first with the
    # default error handling and then with errors="ignore".
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123"  # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        note = "encoding=%r" % encoding

        # check incremental decoder/encoder (fetched via the C API)
        try:
            cencoder = codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            # check C API: feed one character at a time, then flush
            pieces = [cencoder.encode(ch) for ch in s]
            pieces.append(cencoder.encode("", True))
            encodedresult = b"".join(pieces)
            cdecoder = codec_incrementaldecoder(encoding)
            # ... and decode one byte at a time, then flush
            chunks = [cdecoder.decode(bytes([octet]))
                      for octet in encodedresult]
            chunks.append(cdecoder.decode(b"", True))
            self.assertEqual("".join(chunks), s, note)

        if encoding in ("idna", "mbcs"):
            continue

        # check incremental decoder/encoder with errors argument
        try:
            cencoder = codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            pass
        else:
            encodedresult = b"".join([cencoder.encode(ch) for ch in s])
            cdecoder = codec_incrementaldecoder(encoding, "ignore")
            decodedresult = "".join([cdecoder.decode(bytes([octet]))
                                     for octet in encodedresult])
            self.assertEqual(decodedresult, s, note)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002015
def test_seek(self):
    # Re-read the same encoded stream several times through a codec
    # StreamReader, rewinding before every read.
    # all codecs should be able to encode these
    s = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # "idna" is skipped as well -- FIXME: See SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        make_reader = codecs.getreader(encoding)
        reader = make_reader(io.BytesIO(s.encode(encoding)))
        for _ in range(5):
            # Test that calling seek resets the internal codec state and buffers
            reader.seek(0, 0)
            self.assertEqual(s, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00002030
def test_bad_decode_args(self):
    # Every decoder must reject a call without arguments; all of them
    # except "idna" and "punycode" are also checked with an int argument.
    for encoding in all_unicode_encodings:
        decoder = codecs.getdecoder(encoding)
        with self.assertRaises(TypeError):
            decoder()
        if encoding in ("idna", "punycode"):
            continue
        with self.assertRaises(TypeError):
            decoder(42)
2037
def test_bad_encode_args(self):
    # Every encoder must reject a call without arguments.
    for encoding in all_unicode_encodings:
        encoder = codecs.getencoder(encoding)
        # unicode-internal has been deprecated, hence the warnings guard
        with support.check_warnings(), self.assertRaises(TypeError):
            encoder()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002044
def test_encoding_map_type_initialized(self):
    # Smoke test: take the type of cp1140's encoding table and compare it.
    from encodings.cp1140 import encoding_table
    # This used to crash, we are only verifying there's no crash.
    table_type = type(encoding_table)
    self.assertEqual(table_type, table_type)
2050
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_unicode_with_stateful:
            continue
        # Decode-side check first, then encode-side, each handed the
        # text together with a freshly encoded copy of it.
        self.check_state_handling_decode(
            encoding, u, u.encode(encoding))
        self.check_state_handling_encode(
            encoding, u, u.encode(encoding))
2058
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with the different kinds of maps.

    The map is given as a str, an int -> str dict or an int -> int dict,
    and is exercised under the "strict", "replace" and "ignore" handlers.
    """

    def test_decode_with_string_map(self):
        # The map is a str indexed by byte value.
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        # Fully mapped input, including a non-BMP character.
        for table, expected in (
            ("abc", "abc"),
            ("\U0010FFFFbc", "\U0010FFFFbc"),
        ):
            self.assertEqual(decode(raw, "strict", table), (expected, 3))

        # Byte 2 has no usable entry: either the map is too short or the
        # entry is "\ufffe", which these checks expect to be rejected too.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (expected, 3))

        # An empty map with "ignore" drops every byte but consumes them all.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        # The map is a dict from byte value to replacement str.
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        # Values may be single characters, longer strings, non-BMP
        # characters or the empty string.
        for table, expected in (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ):
            self.assertEqual(decode(raw, "strict", table), (expected, 3))

        # Byte 2 is missing, mapped to None, or mapped to '\ufffe'
        # (the last case is Issue #14850).
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (expected, 3))

        # An empty map with "ignore" drops every byte but consumes them all.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        # The map is a dict from byte value to code point.
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"
        a, b, c = (ord(letter) for letter in 'abc')

        # The 0x10FFFF entry is Issue #15379.
        for first, expected in (
            (a, "abc"),
            (0x10FFFF, "\U0010FFFFbc"),
            (sys.maxunicode, chr(sys.maxunicode) + "bc"),
        ):
            self.assertEqual(
                decode(raw, "strict", {0: first, 1: b, 2: c}),
                (expected, 3))

        # A code point past sys.maxunicode is a TypeError, not a decode error.
        self.assertRaises(TypeError,
                          decode, raw, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE.
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, raw, "strict", table)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(raw, errors, table), (expected, 3))
2252
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002253
class WithStmtTest(unittest.TestCase):
    """Check that the codec stream wrappers work as context managers.

    Both tests wrap a BytesIO holding the UTF-8 encoding of U+00FC, read it
    back through the wrapper inside a ``with`` block, and then verify that
    leaving the block closed the underlying stream.
    """

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            # UTF-8 file content is handed back recoded as Latin-1
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as for EncodedFile: exiting the block closes f.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002267
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)

        for decode in escape_decoders:
            for source in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(source), ("\u1234", 6))

        # An escape beyond U+10FFFF is an error in strict mode and yields
        # a single replacement character with "replace".
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2303
Serhiy Storchakad6793772013-01-29 10:20:44 +02002304
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        # Empty input maps to empty output with zero items consumed.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII apart from the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for b in range(32, 127):
            if b == backslash:
                continue
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte apart from the backslash decodes to the character with
        # the same ordinal; the trailing b'0' is decoded along with it.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for b in range(256):
            if b == backslash:
                continue
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        # Characters with a dedicated short escape
        for char, escaped in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(char, escaped)
        # Remaining control characters use \xNN ...
        for b in range(32):
            if chr(b) in '\t\n\r':
                continue
            check(chr(b), ('\\x%02x' % b).encode())
        # ... and so does everything from DEL up to U+00FF
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        for escaped, expected in (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        ):
            check(escaped, expected)
        # A backslash before any byte not listed here is kept literally.
        for b in range(256):
            if b in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # For each escape letter, try escapes carrying 0 .. d-1 zero digits.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                truncated = b"\\" + c + b"0"*i
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A code point above U+10FFFF is rejected as well.
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2381
2382
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / ..._decode()."""

    def test_empty(self):
        # Empty input maps to empty output with zero items consumed.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below U+0100 is encoded as the byte of equal value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            expected = (bytes([b]), 1)
            self.assertEqual(encode(chr(b)), expected)

    def test_raw_decode(self):
        # Every byte decodes to the character with the same ordinal; the
        # trailing b'0' is decoded along with it.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            expected = (chr(b) + '0', 2)
            self.assertEqual(decode(bytes([b]) + b'0'), expected)

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but 'u'/'U' is left untouched.
        for b in range(256):
            if b in b'uU':
                continue
            check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but 'u'/'U' is left untouched.
        for b in range(256):
            if b in b'uU':
                continue
            check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # For each escape letter, try escapes carrying 0 .. d-1 zero digits.
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                truncated = b"\\" + c + b"0"*i
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A code point above U+10FFFF is rejected as well.
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2431
2432
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler with several codecs.

    Undecodable bytes are expected to come back as lone surrogates
    U+DC80..U+DCFF, and encoding those surrogates must restore the bytes.
    """

    def test_utf8(self):
        encoding = "utf-8"
        for raw, text in (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        ):
            self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
            self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2465
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002466
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files around seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def scratch(encoding):
            # Open the shared test file in 'w+' mode with this encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in tests:
            # Check if the BOM is written only once
            with scratch(encoding) as stream:
                stream.write(data)
                stream.write(data)
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with scratch(encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with scratch(encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with scratch(encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with scratch(encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002522
Victor Stinner3fed0872010-05-22 02:16:27 +00002523
# Names of the bytes-to-bytes transform codecs exercised by the tests below.
# The two compression codecs are appended further down, but only when the
# module they depend on can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# Alternative names that must resolve to the codec used as the key.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; the name stays bound (to None) when it is missing so
# that later code can test it.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

# bz2 is optional as well; nothing is registered when it is missing.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002553
2554class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002555
Georg Brandl02524622010-12-02 18:06:51 +00002556 def test_basics(self):
2557 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002558 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002559 with self.subTest(encoding=encoding):
2560 # generic codecs interface
2561 (o, size) = codecs.getencoder(encoding)(binput)
2562 self.assertEqual(size, len(binput))
2563 (i, size) = codecs.getdecoder(encoding)(o)
2564 self.assertEqual(size, len(o))
2565 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002566
Georg Brandl02524622010-12-02 18:06:51 +00002567 def test_read(self):
2568 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002569 with self.subTest(encoding=encoding):
2570 sin = codecs.encode(b"\x80", encoding)
2571 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2572 sout = reader.read()
2573 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002574
2575 def test_readline(self):
2576 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002577 with self.subTest(encoding=encoding):
2578 sin = codecs.encode(b"\x80", encoding)
2579 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2580 sout = reader.readline()
2581 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002582
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002583 def test_buffer_api_usage(self):
2584 # We check all the transform codecs accept memoryview input
2585 # for encoding and decoding
2586 # and also that they roundtrip correctly
2587 original = b"12345\x80"
2588 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002589 with self.subTest(encoding=encoding):
2590 data = original
2591 view = memoryview(data)
2592 data = codecs.encode(data, encoding)
2593 view_encoded = codecs.encode(view, encoding)
2594 self.assertEqual(view_encoded, data)
2595 view = memoryview(data)
2596 data = codecs.decode(data, encoding)
2597 self.assertEqual(data, original)
2598 view_decoded = codecs.decode(view, encoding)
2599 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002600
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002601 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002602 # Check binary -> binary codecs give a good error for str input
2603 bad_input = "bad input type"
2604 for encoding in bytes_transform_encodings:
2605 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002606 fmt = ( "{!r} is not a text encoding; "
2607 "use codecs.encode\(\) to handle arbitrary codecs")
2608 msg = fmt.format(encoding)
2609 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002610 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002611 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002612
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002613 def test_text_to_binary_blacklists_text_transforms(self):
2614 # Check str.encode gives a good error message for str -> str codecs
2615 msg = (r"^'rot_13' is not a text encoding; "
2616 "use codecs.encode\(\) to handle arbitrary codecs")
2617 with self.assertRaisesRegex(LookupError, msg):
2618 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002619
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002620 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002621 # Check bytes.decode and bytearray.decode give a good error
2622 # message for binary -> binary codecs
2623 data = b"encode first to ensure we meet any format restrictions"
2624 for encoding in bytes_transform_encodings:
2625 with self.subTest(encoding=encoding):
2626 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002627 fmt = (r"{!r} is not a text encoding; "
2628 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002629 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002630 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002631 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002632 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002633 bytearray(encoded_data).decode(encoding)
2634
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002635 def test_binary_to_text_blacklists_text_transforms(self):
2636 # Check str -> str codec gives a good error for binary input
2637 for bad_input in (b"immutable", bytearray(b"mutable")):
2638 with self.subTest(bad_input=bad_input):
2639 msg = (r"^'rot_13' is not a text encoding; "
2640 "use codecs.decode\(\) to handle arbitrary codecs")
2641 with self.assertRaisesRegex(LookupError, msg) as failure:
2642 bad_input.decode("rot_13")
2643 self.assertIsNone(failure.exception.__cause__)
2644
Zachary Wareefa2e042013-12-30 14:54:11 -06002645 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002646 def test_custom_zlib_error_is_wrapped(self):
2647 # Check zlib codec gives a good error for malformed input
2648 msg = "^decoding with 'zlib_codec' codec failed"
2649 with self.assertRaisesRegex(Exception, msg) as failure:
2650 codecs.decode(b"hello", "zlib_codec")
2651 self.assertIsInstance(failure.exception.__cause__,
2652 type(failure.exception))
2653
2654 def test_custom_hex_error_is_wrapped(self):
2655 # Check hex codec gives a good error for malformed input
2656 msg = "^decoding with 'hex_codec' codec failed"
2657 with self.assertRaisesRegex(Exception, msg) as failure:
2658 codecs.decode(b"hello", "hex_codec")
2659 self.assertIsInstance(failure.exception.__cause__,
2660 type(failure.exception))
2661
2662 # Unfortunately, the bz2 module throws OSError, which the codec
2663 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002664
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias must resolve to the same codec name as the name it
    # is listed under in transform_aliases
    for canonical, alias_names in transform_aliases.items():
        reference = codecs.lookup(canonical).name
        for alias in alias_names:
            with self.subTest(alias=alias):
                self.assertEqual(codecs.lookup(alias).name, reference)
2673
Martin Panter06171bd2015-09-12 00:34:28 +00002674 def test_quopri_stateless(self):
2675 # Should encode with quotetabs=True
2676 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2677 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2678 # But should still support unescaped tabs and spaces
2679 unescaped = b"space tab eol\n"
2680 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2681
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002682 def test_uu_invalid(self):
2683 # Missing "begin" line
2684 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2685
Nick Coghlan8b097b42013-11-13 23:49:21 +10002686
2687# The codec system tries to wrap exceptions in order to ensure the error
2688# mentions the operation being performed and the codec involved. We
2689# currently *only* want this to happen for relatively stateless
2690# exceptions, where the only significant information they contain is their
2691# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002692
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Codec table consulted by the search function registered below; tests add
# and remove entries keyed by codec name
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Return the test codec stored under *codec_name*, or None if absent."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)
2700
def _forget_codec(codec_name):
    # No-op stand-in, kept only when the import below is unavailable
    pass


try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    pass
2707
2708
class ExceptionChainingTest(unittest.TestCase):
    """Check when the codec machinery wraps exceptions raised by a codec.

    Each test installs a throwaway codec in the module-level _TEST_CODECS
    registry under a name unique to the test instance.  The codec's
    encode/decode callables raise (or return) a chosen object, and the
    test verifies whether the resulting error is wrapped in a chained
    exception that names the operation and the codec.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # The codec was never looked up, so nothing was cached
            pass

    def set_codec(self, encode, decode):
        # Publish a codec built from the given callables under this
        # test's unique codec name
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager: the body must raise exc_type with the wrapped
        # message format, chained from an exc_type cause that still has
        # its traceback.  Note that msg is interpolated into a regex.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Raising obj_to_raise from the codec must yield a wrapped error
        # through both the str/bytes methods and the codecs functions
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Raising obj_to_raise from the codec must surface as a
        # RuntimeError whose text matches the regex msg, for all four
        # entry points
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain.
        # Every piece of each pattern is a raw string so the escaped
        # parentheses are not parsed as (invalid) string escapes.
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2868
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002869
Georg Brandl02524622010-12-02 18:06:51 +00002870
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 is rejected with ValueError, 123 with OSError
        for exc_type, code_page in ((ValueError, -1), (OSError, 123)):
            self.assertRaises(exc_type,
                              codecs.code_page_encode, code_page, 'a')
            self.assertRaises(exc_type,
                              codecs.code_page_decode, code_page, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page name
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # tests is an iterable of (raw bytes, error handler, expected);
        # an expected value of None means decoding must raise
        # UnicodeDecodeError
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            failure_note = ('%a.decode("cp%s", %r)=%a != %a'
                            % (raw, cp, errors, text, expected))
            self.assertEqual(text, expected, failure_note)
            # assert 0 <= decoded[1] <= len(raw)
            consumed = decoded[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # tests is an iterable of (text, error handler, expected);
        # an expected value of None means encoding must raise
        # UnicodeEncodeError
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = encoded[0]
            failure_note = ('%a.encode("cp%s", %r)=%a != %a'
                            % (text, cp, errors, data, expected))
            self.assertEqual(data, expected, failure_note)
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The encoding half only runs when VISTA_OR_LATER is set
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each case is decoded with the final flag set to False and
        # compared against the expected (text, bytes consumed) pair
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
3018
3019
# Run the test suite when this file is executed as a script
if __name__ == "__main__":
    unittest.main()