blob: 7fed1f7734ca15f9590938b00a7afd59dfff7ed4 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major version of 6 or higher;
# the Windows version is never queried on any other platform.
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)

# ctypes is optional.  When it cannot be imported the module name is bound
# to None and the wchar_t size is recorded as -1.
try:
    import ctypes
except ImportError:
    ctypes = None
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build an assertion helper for the codec function *coder*.

    The returned ``check(input, expect)`` calls ``coder(input)`` and uses
    ``self.assertEqual`` to require the result ``(expect, len(input))``.
    """
    def check(input, expect):
        outcome = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(outcome, wanted)
    return check
29
class Queue(object):
    """
    FIFO buffer: data is appended at one end and consumed from the other
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append the new data behind whatever is still pending.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most
        # ``size`` leading items are removed and returned.
        pending = self._buffer
        cut = len(pending) if size < 0 else size
        self._buffer = pending[cut:]
        return pending[:cut]
49
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() of the
    incremental decoder and encoder of an encoding."""

    def check_state_handling_decode(self, encoding, u, s):
        """Decode *s* with a state hand-over at every possible split point.

        The first part is decoded, the decoder state is transplanted into a
        fresh decoder which decodes the rest; the joined output must be *u*.
        """
        split_points = range(len(s) + 1)
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in split_points:
            first = new_decoder()
            head = first.decode(s[:split])
            state = first.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # put the decoder into its default state, nothing buffered
                first.setstate((state[0][:0], 0))
                # Re-feeding the buffered input must not yield any output
                self.assertFalse(first.decode(state[0]))
                # ... and must bring the decoder back to the same state
                self.assertEqual(state, first.getstate())
            # Hand the extracted state over to a brand-new decoder
            second = new_decoder()
            second.setstate(state)
            tail = second.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Encode *u* with a state hand-over at every possible split point.

        The first part is encoded, the encoder state is transplanted into a
        fresh encoder which encodes the rest; the joined output must be *s*.
        """
        split_points = range(len(u) + 1)
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in split_points:
            first = new_encoder()
            head = first.encode(u[:split])
            state = first.getstate()
            second = new_encoder()
            second.setstate(state)
            tail = second.encode(u[split:], True)
            self.assertEqual(s, head + tail)
82
class ReadTest(MixInCheckStateHandling):
    """Reading tests shared by the per-encoding test cases.

    The methods read ``self.encoding`` and ``test_lone_surrogates`` also
    reads ``self.ill_formed_sequence``; neither is defined here, so concrete
    subclasses have to provide them.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* byte by byte and compare the accumulated output.

        *input* is encoded with ``self.encoding``; after each encoded byte is
        fed in, the text decoded so far must equal the corresponding entry of
        *partialresults*.  The check is run through a StreamReader, through
        an incremental decoder, through the same decoder after reset(), and
        finally through codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # readline() of the StreamReader must split on every line end in
        # ``lineends``, with and without keepends and with a size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline(), read() and readlines() on one reader must not
        # lose or duplicate data.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were written, in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Compare the complete list of lines: checking line by line inside
        # an enumerate() loop would silently accept a reader that stops
        # early (or yields nothing at all).
        self.assertEqual(list(reader), s)

    def test_readlinequeue(self):
        # readline() on a stream that is still being written to: a "\r" at
        # the end of the available data is returned as a line end, and the
        # "\n" arriving afterwards shows up as a separate (empty) line.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n" terminated lines must come back unchanged, followed
        # by an empty string at the end of the stream.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Five "\r\n" terminated lines must come back unchanged, followed
        # by an empty string at the end of the stream.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ``ill_formed_sequence`` when decoding with
    # the "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding side: a lone surrogate is rejected in strict mode and
        # handled by each of the error handlers below.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever encoding the empty string produces (a BOM, if any) is
        # stripped from the individually encoded pieces and put in front of
        # the assembled byte sequence once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
381
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest run for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"
    # Bytes of the lone surrogate U+DC80 in the platform's native byte order.
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec = codecs.lookup(self.encoding)
        # write two chunks through one stream writer
        raw = io.BytesIO()
        out = codec.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # exactly one BOM must have been emitted
        self.assertIn(data, (self.spamle, self.spambe))
        # and the data must read back as the original text
        back = codec.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two units of 0xff bytes form a valid BOM.
        for count in (4, 8):
            stream = io.BytesIO(count * b"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        # The first 7 bytes produce nothing: the 4 BOM bytes (the byte
        # order is known after the fourth) and 3 bytes of the first
        # character.  Each further character takes 4 bytes to complete.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 7 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        astral = '\U00010000' * 1024
        samples = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),   # little endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),   # big endian
        )
        for bom, unit in samples:
            self.assertEqual(astral,
                             codecs.utf_32_decode(bom + unit * 1024)[0])
476
class UTF32LETest(ReadTest, unittest.TestCase):
    """ReadTest run for the BOM-less little-endian "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Bytes of the lone surrogate U+DC80, little endian.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # No BOM: the first 3 bytes produce nothing, then every character
        # is complete after its fourth byte.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        payload = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(payload)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
521
class UTF32BETest(ReadTest, unittest.TestCase):
    """ReadTest run for the BOM-less big-endian "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Bytes of the lone surrogate U+DC80, big endian.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # No BOM: the first 3 bytes produce nothing, then every character
        # is complete after its fourth byte.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3 +
            ["\x00"] * 4 +
            ["\x00\xff"] * 4 +
            ["\x00\xff\u0100"] * 4 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        payload = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(payload)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
566
567
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest run for the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"
    # Bytes of the lone surrogate U+DC80 in the platform's native byte order.
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = b'\xff\xfe' + 2 * b's\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff' + 2 * b'\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        codec = codecs.lookup(self.encoding)
        # write two chunks through one stream writer
        raw = io.BytesIO()
        out = codec.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # exactly one BOM must have been emitted
        self.assertIn(data, (self.spamle, self.spambe))
        # and the data must read back as the original text
        back = codec.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two units of 0xff bytes form a valid BOM.
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(data))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        # The first 3 bytes produce nothing: the 2 BOM bytes (the byte
        # order is known after the second) and 1 byte of the first
        # character.  BMP characters complete after 2 bytes, the final
        # non-BMP character only after 4.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3 +
            ["\x00"] * 2 +
            ["\x00\xff"] * 2 +
            ["\x00\xff\u0100"] * 2 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'
        payload = text.encode(self.encoding)

        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as target:
            target.write(payload)
        # Only the open() call itself is expected to emit the
        # DeprecationWarning; reading happens outside the warning check.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000653
class UTF16LETest(ReadTest, unittest.TestCase):
    """ReadTest run for the BOM-less little-endian "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Bytes of the lone surrogate U+DC80, little endian.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # No BOM: the first byte produces nothing, BMP characters complete
        # after 2 bytes and the final non-BMP character after 4.
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] +
            ["\x00"] * 2 +
            ["\x00\xff"] * 2 +
            ["\x00\xff\u0100"] * 2 +
            ["\x00\xff\u0100\uffff"] * 4 +
            ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through its surrogate pair.
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
697
class UTF16BETest(ReadTest, unittest.TestCase):
    """ReadTest checks specialised for the "utf-16-be" codec."""
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        """Run check_partial with the expected stepwise decoding results."""
        # NOTE(review): the list has one entry per encoded byte (12), so
        # check_partial presumably feeds the input one byte at a time.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD on replace."""
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports which vector failed and still runs the
            # remaining ones, as UTF7Test.test_errors already does.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
741
class UTF8Test(ReadTest, unittest.TestCase):
    """ReadTest checks specialised for the "utf-8" codec."""
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        """Run check_partial with the expected stepwise decoding results."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # Repeated entries correspond to steps where no new character is
        # complete yet.
        expected = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        """Exercise decoder state handling on a mix of code point widths."""
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        """Run the inherited checks, then the surrogateescape encode case."""
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        """The surrogatepass handler round-trips lone surrogates."""
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Incomplete or broken surrogate sequences must still be rejected.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000795
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """ReadTest checks specialised for the Windows-only "cp65001" codec."""
    encoding = "cp65001"

    def test_encode(self):
        """Encode vectors; an expected value of None means 'must raise'."""
        if VISTA_OR_LATER:
            surrogate_cases = [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            surrogate_cases = [('\udc80', 'strict', b'\xed\xb2\x80')]
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ] + surrogate_cases

        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode vectors; an expected value of None means 'must raise'."""
        if VISTA_OR_LATER:
            surrogate_cases = [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            surrogate_cases = [
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ]
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ] + surrogate_cases

        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and handled otherwise."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        per_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("namereplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for errors, expected in per_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """The surrogatepass handler round-trips lone surrogates."""
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
897
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200898
class UTF7Test(ReadTest, unittest.TestCase):
    """ReadTest checks specialised for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Run check_partial with the expected stepwise decoding results."""
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # Repeated entries correspond to steps where no new character is
        # complete yet.
        expected = (
            ['a'] * 2
            + ['a+', 'a+-']
            + ['a+-b'] * 5
            + ['a+-b\x00']
            + ['a+-b\x00c'] * 5
            + ['a+-b\x00c\x80']
            + ['a+-b\x00c\x80d'] * 5
            + ['a+-b\x00c\x80d\u0100']
            + ['a+-b\x00c\x80d\u0100e'] * 8
            + ['a+-b\x00c\x80d\u0100e\U00010000']
            + ['a+-b\x00c\x80d\u0100e\U00010000f']
        )
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and yields U+FFFD on replace."""
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character and its surrogate pair encode identically."""
        wire = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), wire)
        self.assertEqual(wire.decode(self.encoding), '\U000104A0')

    # Disable the inherited lone-surrogate test for this codec.
    test_lone_surrogates = None
972
973
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """A lone byte with final=True is rejected in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder without arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
981
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of signed bytes is returned as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        """An empty str yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """Missing or non-buffer arguments raise TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
997
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """UTF8Test checks re-run against "utf-8-sig", plus BOM handling."""
    encoding = "utf-8-sig"

    def test_partial(self):
        """Run check_partial with the expected stepwise decoding results."""
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        """A buffer holding only the BOM decodes to the empty string."""
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder drops the BOM written by the encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader in chunks.

        The stream is drained once with an unbounded read() and once for
        each size hint; every run must reassemble exactly *unistring*.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest names the size hint that failed and keeps going.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """A leading BOM is stripped whatever the read size."""
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        """Input without a BOM decodes unchanged whatever the read size."""
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
1081
1082class EscapeDecodeTest(unittest.TestCase):
1083 def test_empty(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001084 self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +00001085
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001086 def test_raw(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001087 decode = codecs.escape_decode
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001088 for b in range(256):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001089 b = bytes([b])
1090 if b != b'\\':
1091 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001092
1093 def test_escape(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001094 decode = codecs.escape_decode
1095 check = coding_checker(self, decode)
1096 check(b"[\\\n]", b"[]")
1097 check(br'[\"]', b'["]')
1098 check(br"[\']", b"[']")
1099 check(br"[\\]", br"[\]")
1100 check(br"[\a]", b"[\x07]")
1101 check(br"[\b]", b"[\x08]")
1102 check(br"[\t]", b"[\x09]")
1103 check(br"[\n]", b"[\x0a]")
1104 check(br"[\v]", b"[\x0b]")
1105 check(br"[\f]", b"[\x0c]")
1106 check(br"[\r]", b"[\x0d]")
1107 check(br"[\7]", b"[\x07]")
1108 check(br"[\8]", br"[\8]")
1109 check(br"[\78]", b"[\x078]")
1110 check(br"[\41]", b"[!]")
1111 check(br"[\418]", b"[!8]")
1112 check(br"[\101]", b"[A]")
1113 check(br"[\1010]", b"[A0]")
1114 check(br"[\501]", b"[A]")
1115 check(br"[\x41]", b"[A]")
1116 check(br"[\X41]", br"[\X41]")
1117 check(br"[\x410]", b"[A0]")
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001118 for b in range(256):
1119 if b not in b'\n"\'\\abtnvfr01234567x':
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001120 b = bytes([b])
1121 check(b'\\' + b, b'\\' + b)
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001122
1123 def test_errors(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +02001124 decode = codecs.escape_decode
1125 self.assertRaises(ValueError, decode, br"\x")
1126 self.assertRaises(ValueError, decode, br"[\x]")
1127 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
1128 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
1129 self.assertRaises(ValueError, decode, br"\x0")
1130 self.assertRaises(ValueError, decode, br"[\x0]")
1131 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
1132 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001133
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001134class RecodingTest(unittest.TestCase):
1135 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001136 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +02001137 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001138 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +00001139 f2.close()
1140 # Python used to crash on this at exit because of a refcount
1141 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +00001142
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001143 self.assertTrue(f.closed)
1144
# From RFC 3492
# Each entry is a (unicode string, expected punycode bytes) pair; the
# letters in the comments are the labels of the individual samples.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1248
# Import-time sanity check: print every entry of the table above that
# does not have exactly two items.
for malformed in (case for case in punycode_testcases if len(case) != 2):
    print(repr(malformed))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001252
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over every pair in punycode_testcases."""

    def test_encode(self):
        """Encoding matches the expected punycode, ignoring letter case."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        """Decoding the punycode bytes gives back the unicode string."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Repeat with the bytes passed through an ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001271
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the deprecated "unicode_internal" codec."""

    # Warning filter handed to support.check_warnings() wherever the
    # codec's deprecation warning is expected.
    _deprecation = ('unicode_internal codec has been '
                    'deprecated', DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Code points above 0x10ffff must be rejected when decoding."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The vectors are written big-endian; flip them on little-endian
        # machines.
        for internal, uni in ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings(self._deprecation):
                with self.assertRaises(UnicodeDecodeError):
                    internal.decode("unicode_internal")
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the offending slice."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(self._deprecation):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is applied to the invalid unit."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._deprecation):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four invalid bytes between the two encoded characters.
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the number of input characters consumed."""
        with support.check_warnings(self._deprecation):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001347
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected), both UTF-8 encoded bytes:
#   * expected is None    -> nameprep must reject the input;
#   * input is None       -> the vector is skipped (placeholder that keeps
#                            the list index aligned with the "3.N" numbering).
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1500
1501
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Each vector runs in its own subTest so that one failure is
            # reported under its "3.N" label without hiding later vectors.
            with self.subTest(vector="3.%d" % (pos+1)):
                # The Unicode strings are given in UTF-8
                orig = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    prepped = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001520
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec via the one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once all three bytes have been read, a further read() must yield
        # an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Punycode label, without and with a trailing dot.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot that terminates it has been
        # seen; the last label is flushed by the final=True call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII label, without and with a trailing dot.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # As for decoding: output appears label by label, and the trailing
        # label is only produced by the final=True call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1606
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions and attributes of codecs."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
                         '\xe4\xf6\xfc')
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching so the locale is restored even when
        # the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(api, codecs.__all__)
        # Every exported name must actually exist on the module.  A separate
        # loop variable keeps the expected tuple above from being shadowed.
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode), \
                    codecs.open(support.TESTFN, mode, 'ascii') as file:
                self.assertIsInstance(file, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec must fail for any input (even empty) and
        # regardless of the error handler requested.
        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)
1714
class StreamReaderTest(unittest.TestCase):
    """Tests for StreamReader objects obtained through codecs.getreader()."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001724
class EncodedFileTest(unittest.TestCase):
    """Tests for the transcoding wrapper returned by codecs.EncodedFile()."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller receives UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001736
# Names of the text encodings exercised by the generic round-trip tests.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1842
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001852
Walter Dörwald3abcb012007-04-16 22:10:50 +00001853class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001854 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001855 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001856 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001857 name = codecs.lookup(encoding).name
1858 if encoding.endswith("_codec"):
1859 name += "_codec"
1860 elif encoding == "latin_1":
1861 name = "latin_1"
1862 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001863
Ezio Melottiadc417c2011-11-17 12:23:34 +02001864 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001865 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001866 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001867 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001868 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001869 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001870
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001871 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001872 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001873 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001874 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001875 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001876 for c in s:
1877 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001878 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001879 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001880 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001881 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001882 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001883 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001884 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001885 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001886 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001887 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001888
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001889 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001890 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001891 try:
1892 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001893 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001894 pass
1895 else:
1896 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001897 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001898 for c in s:
1899 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001900 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001901 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001902 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001903 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001904 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001905 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001906 self.assertEqual(decodedresult, s,
1907 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001908
1909 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001910 result = "".join(codecs.iterdecode(
1911 codecs.iterencode(s, encoding), encoding))
1912 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001913
1914 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001915 result = "".join(codecs.iterdecode(
1916 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001917 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001918
Victor Stinner554f3f02010-06-16 23:33:54 +00001919 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001920 # check incremental decoder/encoder with errors argument
1921 try:
1922 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001923 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001924 pass
1925 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001926 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001927 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001928 decodedresult = "".join(decoder.decode(bytes([c]))
1929 for c in encodedresult)
1930 self.assertEqual(decodedresult, s,
1931 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001932
@support.cpython_only
def test_basics_capi(self):
    # Round-trip a short ASCII string through the incremental encoder and
    # decoder objects returned by the _testcapi helper functions, for
    # every name in all_unicode_encodings.
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123" # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check C API
                # Feed one character per call, then make a last call with
                # empty input and a true second argument (presumably the
                # "final" flag -- confirm against the encoder API).
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                # Feed one byte per call, mirroring the encoding loop.
                decodedresult = ""
                for c in encodedresult:
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            # NOTE(review): this section is assumed to sit inside the
            # broken_unicode_with_stateful guard above -- confirm.
            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # Unlike the section above, no closing call with empty
                    # input is made on either the encoder or the decoder.
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001971
def test_seek(self):
    # Reading after seek(0, 0) must always yield the complete text again,
    # i.e. seeking has to reset the reader's codec state and buffers.
    # all codecs should be able to encode these
    text = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for name in all_unicode_encodings:
        # "idna" is excluded, FIXME: See SF bug #1163178
        if name == "idna" or name in broken_unicode_with_stateful:
            continue
        make_reader = codecs.getreader(name)
        reader = make_reader(io.BytesIO(text.encode(name)))
        for _ in range(5):
            reader.seek(0, 0)
            self.assertEqual(text, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00001986
def test_bad_decode_args(self):
    # Every decoder must raise TypeError when called without input; all
    # except the two exempted codecs must also raise it for an int.
    exempt_from_int_check = ("idna", "punycode")
    for name in all_unicode_encodings:
        decode = codecs.getdecoder(name)
        with self.assertRaises(TypeError):
            decode()
        if name in exempt_from_int_check:
            continue
        with self.assertRaises(TypeError):
            decode(42)
1993
def test_bad_encode_args(self):
    # Every encoder must raise TypeError when called without input.
    for name in all_unicode_encodings:
        encode = codecs.getencoder(name)
        # unicode-internal has been deprecated, so the call is made
        # inside support.check_warnings().
        with support.check_warnings(), self.assertRaises(TypeError):
            encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002001 def test_encoding_map_type_initialized(self):
2002 from encodings import cp1140
2003 # This used to crash, we are only verifying there's no crash.
2004 table_type = type(cp1140.encoding_table)
2005 self.assertEqual(table_type, table_type)
2006
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    text = "abc123"
    for name in all_unicode_encodings:
        if name in broken_unicode_with_stateful:
            continue
        self.check_state_handling_decode(name, text, text.encode(name))
        self.check_state_handling_encode(name, text, text.encode(name))
2014
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with str, int->str and int->int
    # mappings under the "strict", "replace" and "ignore" error handlers.
    # Each test decodes the three bytes 0, 1 and 2.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Fully mapped input, including a non-BMP character.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Byte 2 is either beyond the end of the map or mapped to U+FFFE.
        incomplete_tables = ("ab", "ab\ufffe")
        for table in incomplete_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete_tables:
                self.assertEqual(decode(data, errors, table), (text, 3))

        # With an empty map and "ignore" every byte is dropped.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Values may be one character, several characters, a non-BMP
        # character or the empty string.
        complete_cases = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, text in complete_cases:
            self.assertEqual(decode(data, "strict", table), (text, 3))

        # Byte 2 is absent, mapped to None, or mapped to U+FFFE
        # (Issue #14850).
        incomplete_tables = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in incomplete_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete_tables:
                self.assertEqual(decode(data, errors, table), (text, 3))

        # With an empty map and "ignore" every byte is dropped.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        # Code points up to sys.maxunicode are accepted (Issue #15379
        # covers 0x10FFFF).
        complete_cases = (
            ({0: a, 1: b, 2: c}, "abc"),
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for table, text in complete_cases:
            self.assertEqual(decode(data, "strict", table), (text, 3))

        # One past the largest code point is a TypeError.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is absent or mapped to 0xFFFE.
        incomplete_tables = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in incomplete_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete_tables:
                self.assertEqual(decode(data, errors, table), (text, 3))
2208
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002209
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable as context managers.

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            payload = ef.read()
            self.assertEqual(payload, b"\xfc")
        # Leaving the block must have closed the underlying stream.
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002223
class TypesTest(unittest.TestCase):
    # Checks which low-level decoder functions accept str input.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoder_names = (
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        )
        decoders = [getattr(codecs, name) for name in decoder_names]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for data in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(data), ("\u1234", 6))

        # A code point above 0x10FFFF is an error under the default
        # handler and becomes U+FFFD under "replace".
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2259
Serhiy Storchakad6793772013-01-29 10:20:44 +02002260
class UnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.unicode_escape_encode() / unicode_escape_decode().

    def test_empty(self):
        # Empty input maps to empty output with zero units consumed.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126, except the backslash, are encoded as
        # themselves.
        encode = codecs.unicode_escape_encode
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the character with
        # the same ordinal.
        decode = codecs.unicode_escape_decode
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        # Characters with a dedicated short escape.
        short_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in short_escapes:
            check(char, escaped)
        # The remaining characters below 32 and those from 127 to 255 are
        # written as \xNN.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        # Larger code points use the \u and \U forms.
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for encoded, text in cases:
            check(encoded, text)
        # A backslash followed by any byte outside this set is expected to
        # decode to the backslash plus that character, unchanged.
        escape_letters = b'\n"\'\\abtnvfr01234567xuUN'
        for code in range(256):
            if code in escape_letters:
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Escape letter paired with the number of truncated digit counts
        # (0 .. count-1) to try.
        for letter, count in (b'x', 2), (b'u', 4), (b'U', 4):
            for digits in range(count):
                truncated = b"\\" + letter + b"0"*digits
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above 0x10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2337
2338
class RawUnicodeEscapeTest(unittest.TestCase):
    # Tests for codecs.raw_unicode_escape_encode() /
    # raw_unicode_escape_decode().

    def test_empty(self):
        # Empty input maps to empty output with zero units consumed.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of the same
        # value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the character with the same ordinal.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything other than "u" or "U" is
        # copied through as-is.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything other than "u" or "U" is
        # copied through as-is.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Escape letter paired with the number of truncated digit counts
        # (0 .. count-1) to try.
        for letter, count in (b'u', 4), (b'U', 4):
            for digits in range(count):
                truncated = b"\\" + letter + b"0"*digits
                bracketed = b"[" + truncated + b"]"
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape naming a code point above 0x10FFFF.
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2387
2388
class SurrogateEscapeTest(unittest.TestCase):
    # The "surrogateescape" handler must turn undecodable bytes into lone
    # surrogates in the U+DC80..U+DCFF range and turn them back on encoding.

    def check_roundtrip(self, encoding, raw, text):
        # Decode first, then encode, matching the order of the assertions
        # this helper replaces.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self.check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self.check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self.check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2421
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002422
class BomTest(unittest.TestCase):
    # Checks when files opened through codecs.open() do and do not write a
    # BOM, for the UTF-16 and UTF-32 codec families.

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings_under_test = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Each scenario starts from a fresh 'w+' open of the test file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings_under_test:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002478
Victor Stinner3fed0872010-05-22 02:16:27 +00002479
# Bytes-to-bytes codecs that are always expected to be present.  Codecs
# backed by optional modules are appended further down.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Codec name -> alternative names that must resolve to the same codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# zlib is optional; the name is bound to None when the module is missing.
try:
    import zlib
except ImportError:
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

# bz2 is optional as well; nothing is registered when it is missing.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002509
class TransformCodecTest(unittest.TestCase):
    # Tests for the bytes-to-bytes transform codecs listed in
    # bytes_transform_encodings and for the str-to-str rot_13 codec.
    #
    # Every regular-expression fragment below is a raw string literal so
    # that "\(" and "\)" reach the regex engine without relying on Python
    # passing unknown escape sequences through (which is deprecated).

    def test_basics(self):
        # Round-trip every byte value through the generic codec interface.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # A stream reader must decode the encoded form of one byte.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # Same as test_read, but through readline().
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        fmt = (r"{!r} is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        fmt = (r"{!r} is not a text encoding; "
               r"use codecs.decode\(\) to handle arbitrary codecs")
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.decode\(\) to handle arbitrary codecs")
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapping exception must chain to a cause of its own type.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        # The wrapping exception must chain to a cause of its own type.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2633
Nick Coghlan8b097b42013-11-13 23:49:21 +10002634
2635# The codec system tries to wrap exceptions in order to ensure the error
2636# mentions the operation being performed and the codec involved. We
2637# currently *only* want this to happen for relatively stateless
2638# exceptions, where the only significant information they contain is their
2639# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002640
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Entries looked up by the search function below, keyed by codec name.
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Codec search function: return the stored entry for codec_name, or
    # None when there is none.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
codecs.register(_get_test_codec) # Returns None, not usable as a decorator

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # No such helper available: fall back to a do-nothing stand-in with
    # the same call signature.
    def _forget_codec(codec_name):
        pass
2655
2656
Nick Coghlan8b097b42013-11-13 23:49:21 +10002657class ExceptionChainingTest(unittest.TestCase):
2658
def setUp(self):
    # The exception to raise lives on the instance rather than in a
    # closure: codec caching means the codec entry can't be recreated, and
    # regrtest refleak hunting runs the same test instance several times,
    # so the codecs have to call back in to the instance to find out
    # which exception to raise on the current run.
    self.obj_to_raise = RuntimeError

    # A codec search function can't be unregistered, so this one is made
    # fairly harmless after the test case finishes by deriving the codec
    # name from the test case repr.  The id is appended to get a truly
    # unique name, which avoids codec cache issues when the tests run
    # multiple times (e.g. when hunting for refleaks).
    # The codecs module normalizes codec names, although this doesn't
    # appear to be formally documented...
    name_parts = (repr(self), str(id(self)))
    normalized = encodings.normalize_encoding("".join(name_parts))
    self.codec_name = normalized.lower()
Nick Coghlan8b097b42013-11-13 23:49:21 +10002679
Nick Coghlan4e553e22013-11-16 00:35:34 +10002680 def tearDown(self):
2681 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8fad1672014-09-15 23:50:44 +12002682 # Issue #22166: Also pop from caches to avoid appearance of ref leaks
2683 encodings._cache.pop(self.codec_name, None)
2684 try:
2685 _forget_codec(self.codec_name)
2686 except KeyError:
2687 pass
Nick Coghlan8b097b42013-11-13 23:49:21 +10002688
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002689 def set_codec(self, encode, decode):
2690 codec_info = codecs.CodecInfo(encode, decode,
Nick Coghlan4e553e22013-11-16 00:35:34 +10002691 name=self.codec_name)
2692 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002693
2694 @contextlib.contextmanager
2695 def assertWrapped(self, operation, exc_type, msg):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002696 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002697 operation, self.codec_name, exc_type.__name__, msg)
2698 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2699 yield caught
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002700 self.assertIsInstance(caught.exception.__cause__, exc_type)
Nick Coghlan77b286b2014-01-27 00:53:38 +10002701 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002702
2703 def raise_obj(self, *args, **kwds):
2704 # Helper to dynamically change the object raised by a test codec
2705 raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002706
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002707 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002708 self.obj_to_raise = obj_to_raise
2709 self.set_codec(self.raise_obj, self.raise_obj)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002710 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002711 "str_input".encode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002712 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002713 codecs.encode("str_input", self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002714 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002715 b"bytes input".decode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002716 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002717 codecs.decode(b"bytes input", self.codec_name)
2718
2719 def test_raise_by_type(self):
2720 self.check_wrapped(RuntimeError, "")
2721
2722 def test_raise_by_value(self):
2723 msg = "This should be wrapped"
2724 self.check_wrapped(RuntimeError(msg), msg)
2725
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002726 def test_raise_grandchild_subclass_exact_size(self):
2727 msg = "This should be wrapped"
2728 class MyRuntimeError(RuntimeError):
2729 __slots__ = ()
2730 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2731
2732 def test_raise_subclass_with_weakref_support(self):
2733 msg = "This should be wrapped"
2734 class MyRuntimeError(RuntimeError):
2735 pass
2736 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2737
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002738 def check_not_wrapped(self, obj_to_raise, msg):
2739 def raise_obj(*args, **kwds):
2740 raise obj_to_raise
2741 self.set_codec(raise_obj, raise_obj)
2742 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002743 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002744 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002745 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002746 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002747 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002748 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002749 codecs.decode(b"bytes input", self.codec_name)
2750
2751 def test_init_override_is_not_wrapped(self):
2752 class CustomInit(RuntimeError):
2753 def __init__(self):
2754 pass
2755 self.check_not_wrapped(CustomInit, "")
2756
2757 def test_new_override_is_not_wrapped(self):
2758 class CustomNew(RuntimeError):
2759 def __new__(cls):
2760 return super().__new__(cls)
2761 self.check_not_wrapped(CustomNew, "")
2762
2763 def test_instance_attribute_is_not_wrapped(self):
2764 msg = "This should NOT be wrapped"
2765 exc = RuntimeError(msg)
2766 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002767 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002768
2769 def test_non_str_arg_is_not_wrapped(self):
2770 self.check_not_wrapped(RuntimeError(1), "1")
2771
2772 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002773 msg_re = r"^\('a', 'b', 'c'\)$"
2774 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002775
2776 # http://bugs.python.org/issue19609
2777 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002778 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002779 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002780 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002781 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002782 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002783 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002784 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002785 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002786 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002787 codecs.decode(b"bytes input", self.codec_name)
2788
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002789 def test_unflagged_non_text_codec_handling(self):
2790 # The stdlib non-text codecs are now marked so they're
2791 # pre-emptively skipped by the text model related methods
2792 # However, third party codecs won't be flagged, so we still make
2793 # sure the case where an inappropriate output type is produced is
2794 # handled appropriately
2795 def encode_to_str(*args, **kwds):
2796 return "not bytes!", 0
2797 def decode_to_bytes(*args, **kwds):
2798 return b"not str!", 0
2799 self.set_codec(encode_to_str, decode_to_bytes)
2800 # No input or output type checks on the codecs module functions
2801 encoded = codecs.encode(None, self.codec_name)
2802 self.assertEqual(encoded, "not bytes!")
2803 decoded = codecs.decode(None, self.codec_name)
2804 self.assertEqual(decoded, b"not str!")
2805 # Text model methods should complain
2806 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2807 "use codecs.encode\(\) to encode to arbitrary types$")
2808 msg = fmt.format(self.codec_name)
2809 with self.assertRaisesRegex(TypeError, msg):
2810 "str_input".encode(self.codec_name)
2811 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2812 "use codecs.decode\(\) to decode to arbitrary types$")
2813 msg = fmt.format(self.codec_name)
2814 with self.assertRaisesRegex(TypeError, msg):
2815 b"bytes input".decode(self.codec_name)
2816
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002817
Georg Brandl02524622010-12-02 18:06:51 +00002818
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode().
    #
    # The check_* helpers take (input, errors, expected) triples; an
    # expected value of None means the call must raise the corresponding
    # Unicode error instead of returning a result.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        for code_page, exc_type in ((-1, ValueError), (123, OSError)):
            self.assertRaises(exc_type,
                              codecs.code_page_encode, code_page, 'a')
            self.assertRaises(exc_type,
                              codecs.code_page_decode, code_page, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page that failed
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Decode each raw input with code page `cp` (final=True) and
        # compare against the expected text.
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode,
                                  cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            consumed = result[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Encode each text input with code page `cp` and compare against
        # the expected bytes; the reported length must equal len(text).
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode,
                                  cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = result[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # With final=False the result is (decoded text, bytes consumed);
        # the expected values below show trailing incomplete input being
        # left unconsumed.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2975
2976
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()