blob: 254c0c1d64fb7d051b94d5915e8a7e87559d5607 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major version of 6 or higher.
# The short-circuit keeps sys.getwindowsversion() from being touched on
# platforms where it does not exist.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
16
# ctypes is optional: when it cannot be imported the module name is bound
# to None and the wchar_t width is recorded as -1 ("unknown").
try:
    import ctypes
except ImportError:
    ctypes, SIZEOF_WCHAR_T = None, -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Build a checker for a stateless codec function.

    *self* supplies ``assertEqual``; *coder* is called with the input and
    its result is compared against ``(expect, len(input))``.
    """
    def check(input, expect):
        produced = coder(input)
        self.assertEqual(produced, (expect, len(input)))
    return check
29
Victor Stinnerf96418d2015-09-21 23:06:27 +020030
class Queue(object):
    """
    In-memory FIFO: write() appends at the tail, read() consumes from
    the head.  The element type is whatever the initial buffer is.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        # A negative size means "everything that is currently queued".
        if size < 0:
            size = len(self._buffer)
        head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
50
Victor Stinnerf96418d2015-09-21 23:06:27 +020051
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    The host class must provide the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split the encoded input *s* at every offset, move the decoder
        # state into a fresh decoder at the split point and verify that
        # the two decoded halves still add up to *u*.
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                pending = state[0]
                # put the decoder back into its default state with
                # nothing buffered
                decoder.setstate((pending[:0], 0))
                # Re-feeding the buffered input must not emit any output
                self.assertFalse(decoder.decode(pending))
                # ... and must bring the decoder back to the same state
                self.assertEqual(state, decoder.getstate())
            # Continue in a brand-new decoder primed with the saved state
            resumed = new_decoder()
            resumed.setstate(state)
            tail = resumed.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split the text *u* at every offset and
        # hand the encoder state over to a fresh encoder.
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            resumed = new_encoder()
            resumed.setstate(state)
            tail = resumed.encode(u[split:], True)
            self.assertEqual(s, head + tail)
84
Victor Stinnerf96418d2015-09-21 23:06:27 +020085
class ReadTest(MixInCheckStateHandling):
    """Encoding-independent reader tests shared by the per-codec test cases.

    Subclasses are expected to also derive from unittest.TestCase and to
    set ``encoding``.  ``test_lone_surrogates`` additionally reads
    ``ill_formed_sequence`` from the subclass.
    """

    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to several decoding front ends
        # one byte at a time and check, after every byte, that everything
        # decoded so far equals the matching entry of partialresults.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of input
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line until readline() returns an empty
            # string and join the pieces with "|" so that the split
            # points become visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            # fresh StreamReader over the encoded rhyme
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Compare the complete list of lines: checking only the lines the
        # reader happens to yield would let a reader that stops early (or
        # yields nothing at all) pass unnoticed.
        self.assertEqual(list(reader), s)

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data becomes visible to
        # readline() piece by piece.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # the stream is exhausted after the third line
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # the stream is exhausted after the fifth line
        self.assertEqual(reader.readline(), "")

    # What decoding an ill-formed sequence with errors="replace" is
    # expected to produce.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding side: a lone surrogate is rejected in strict mode and
        # routed through the named error handler otherwise.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for the empty string is treated as its
        # prefix (BOM) and stripped from the individually encoded pieces.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            # backslashreplace on decoding escapes every offending byte
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200388
Victor Stinnerf96418d2015-09-21 23:06:27 +0200389
class UTF32Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-32" codec, whose encoded form starts with a BOM.
    encoding = "utf-32"
    # Raw code unit for the lone surrogate U+DC80 in the byte order of the
    # machine running the tests.
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write "spam" twice through one stream writer
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for _ in range(2):
            out.write("spam")
        encoded = raw.getvalue()
        # exactly one BOM must have been emitted
        self.assertIn(encoded, (self.spamle, self.spambe))
        # and the data must read back unchanged
        back = info.streamreader(io.BytesIO(encoded))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        for count in (4, 8):
            reader = codecs.getreader(self.encoding)(io.BytesIO(count*b"\xff"))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Four BOM bytes (byte order known after the fourth) plus the
        # first three bytes of the first character produce no output.
        expected = [""] * 7
        # From then on every fourth byte completes one more character.
        for done in range(1, 5):
            expected += [text[:done]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        samples = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),  # little endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),  # big endian
        )
        for bom, unit in samples:
            self.assertEqual(expected,
                             codecs.utf_32_decode(bom + unit * 1024)[0])
484
Victor Stinnerf96418d2015-09-21 23:06:27 +0200485
class UTF32LETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less little-endian "utf-32-le" codec.
    encoding = "utf-32-le"
    # raw little-endian code unit for the lone surrogate U+DC80
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first three bytes produce nothing; afterwards every fourth
        # byte completes one more character.
        expected = [""] * 3
        for done in range(1, 5):
            expected += [text[:done]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        decoded = codecs.utf_32_le_decode(b'\x00\x00\x01\x00' * 1024)[0]
        self.assertEqual(expected, decoded)
530
Victor Stinnerf96418d2015-09-21 23:06:27 +0200531
class UTF32BETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less big-endian "utf-32-be" codec.
    encoding = "utf-32-be"
    # raw big-endian code unit for the lone surrogate U+DC80
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The first three bytes produce nothing; afterwards every fourth
        # byte completes one more character.
        expected = [""] * 3
        for done in range(1, 5):
            expected += [text[:done]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        decoded = codecs.utf_32_be_decode(b'\x00\x01\x00\x00' * 1024)[0]
        self.assertEqual(expected, decoded)
576
577
class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the "utf-16" codec, whose encoded form starts with a BOM.
    encoding = "utf-16"
    # Raw code unit for the lone surrogate U+DC80 in the byte order of the
    # machine running the tests.
    ill_formed_sequence = (b"\x80\xdc" if sys.byteorder == 'little'
                           else b"\xdc\x80")

    # "spamspam" preceded by exactly one BOM, in either byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write "spam" twice through one stream writer
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for _ in range(2):
            out.write("spam")
        encoded = raw.getvalue()
        # exactly one BOM must have been emitted
        self.assertIn(encoded, (self.spamle, self.spambe))
        # and the data must read back unchanged
        back = info.streamreader(io.BytesIO(encoded))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        for bad in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(bad))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # Two BOM bytes (byte order known after the second) plus the
        # first byte of the first character produce no output.
        expected = [""] * 3
        # Each of the next three characters stays visible for two bytes ...
        for done in range(1, 4):
            expected += [text[:done]] * 2
        # ... while the final character needs four bytes (surrogate pair).
        expected += [text[:4]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, replacement in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((replacement, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as out:
            out.write(payload)
        # a DeprecationWarning is expected while opening with mode 'U'
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000663
class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less little-endian "utf-16-le" codec.
    encoding = "utf-16-le"
    # raw little-endian code unit for the lone surrogate U+DC80
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # The very first byte produces nothing.
        expected = [""]
        # Each of the next three characters stays visible for two bytes ...
        for done in range(1, 4):
            expected += [text[:done]] * 2
        # ... while the final character needs four bytes (surrogate pair).
        expected += [text[:4]] * 4
        expected.append(text)
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # a non-BMP character round-trips as a surrogate pair
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
707
class UTF16BETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-be"
    # Code unit 0xDC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected entry per input byte fed to the decoder: a BMP
        # character shows up after its second byte, the non-BMP character
        # only after all four bytes of its surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (ill-formed input, expected result with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest keeps the remaining rows running after a failure and
            # names the failing input in the report.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # U+10203 is encoded as the surrogate pair D800 DE03.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
751
class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    # The three bytes a surrogatepass encoder emits for the lone
    # surrogate U+DC80; a strict UTF-8 decoder must reject them.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # One expected entry per input byte fed to the decoder; a character
        # only appears once the last byte of its sequence has arrived.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Sample text covering the boundaries between 1-, 2-, 3- and 4-byte
        # UTF-8 sequences.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
                         b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its surrogatepass encoding);
        # each pair must round-trip in both directions.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            with self.subTest(text=text):
                self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
                self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error,
        # even with the surrogatepass handler.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.subTest(raw=broken):
                with self.assertRaises(UnicodeDecodeError):
                    broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000805
Victor Stinnerf96418d2015-09-21 23:06:27 +0200806
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); an expected value
        # of None means the encode call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The expected handling of a lone surrogate depends on the Windows
        # version flag.
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest lets every row report independently.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                                     '%a.encode("cp65001", %r)=%a != %a'
                                     % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); an expected value
        # of None means the decode call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        # The expected handling of an encoded lone surrogate depends on the
        # Windows version flag.
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            # subTest lets every row report independently.
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                                     '%a.decode("cp65001", %r)=%a != %a'
                                     % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        # ... while each error handler produces its own substitution.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # surrogatepass must round-trip lone surrogates in both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(
            b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
            "abc\ud800def")
        self.assertEqual(
            "\U00010fff\uD800".encode("cp65001", "surrogatepass"),
            b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(
            b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001",
                                                   "surrogatepass"),
            "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
908
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200909
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_partial(self):
        # One expected entry per input byte fed to the decoder; characters
        # inside a base64 run only appear once enough bytes have arrived.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # (ill-formed input, expected result with errors='replace')
        tests = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to the
        # same bytes; decoding yields the single non-BMP character.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')

    # Setting the inherited test to None removes it from this class.
    test_lone_surrogates = None
983
984
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A lone byte is an incomplete code unit; with final=True the
        # strict decoder has to raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
992
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # Imported locally: this is the only test that needs the module.
        import array
        # Any object exposing the buffer interface is accepted.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        # An empty str is accepted and yields empty bytes.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument and a non-buffer argument are both rejected.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1008
class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected entry per input byte fed to the decoder.  The input
        # text starts with U+FEFF, so the encoded stream carries two BOMs:
        # the signature (skipped) and the character itself (emitted).
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder is stripped again by the
        # incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a range
        # of read sizes (None means one unbounded read() per call) and
        # check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature: the BOM is dropped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature decodes to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1092
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # Empty input of either bytes-like type decodes to empty bytes.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        for value in range(256):
            byte = bytes([value])
            if byte != b'\\':
                self.assertEqual(decode(byte + b'0'), (byte + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is removed entirely.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        # Single-letter escapes.
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes: up to three octal digits; \501 is expected to
        # yield the same byte as \101.
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        # Hex escapes: exactly two digits, lowercase 'x' only.
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # A backslash followed by any byte that does not start a known
        # escape is kept verbatim.
        for value in range(256):
            if value not in b'\n"\'\\abtnvfr01234567x':
                byte = bytes([value])
                check(b'\\' + byte, b'\\' + byte)

    def test_errors(self):
        decode = codecs.escape_decode
        # \x with no hex digits.
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        # \x with only one hex digit.
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001145
Victor Stinnerf96418d2015-09-21 23:06:27 +02001146
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # unicode_internal is a deprecated codec; skip instead of failing
        # with LookupError on builds that no longer provide it.
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("unicode_internal codec is not available")

        f = io.BytesIO()
        # The context manager closes the recoder (and with it the
        # underlying stream) even if write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        self.assertTrue(f.closed)
1157
# From RFC 3492
# Each entry is (unicode text, expected Punycode encoding as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check on the table itself: a missing comma would silently merge or
# drop a field, so report any entry that is not a 2-tuple.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001265
Victor Stinnerf96418d2015-09-21 23:06:27 +02001266
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Repeat with a bytes object rebuilt through an ASCII
                # round-trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001285
Victor Stinnerf96418d2015-09-21 23:06:27 +02001286
class UnicodeInternalTest(unittest.TestCase):
    # Filter for support.check_warnings(): the warning that using the
    # unicode_internal codec is expected to emit.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # The byte strings below are written big-endian and reversed on
        # little-endian machines.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(self._DEPRECATED):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the valid code point range.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(
                invalid.decode("unicode_internal", "backslashreplace"),
                invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # Same nesting as the not_ok loop in test_bug1251300: the warning
        # check wraps the assertRaises.
        with support.check_warnings(self._DEPRECATED):
            with self.assertRaises(UnicodeDecodeError) as cm:
                data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        # The error must cover exactly the second 4-byte unit.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the 'ignore' handler under a private name and check that
        # the decoder calls it for an invalid 4-byte unit placed between
        # the encodings of "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            # The second item returned by an encoder is the number of
            # input characters consumed, not the size of the output.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001367
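# Nameprep test vectors, from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None means nameprep() must reject the input, and
# (None, None) marks a vector that is deliberately skipped.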
1369nameprep_tests = [
1370 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001371 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1372 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1373 b'\xb8\x8f\xef\xbb\xbf',
1374 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001375 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001376 (b'CAFE',
1377 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001378 # 3.3 Case folding 8bit U+00DF (german sharp s).
1379 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001380 (b'\xc3\x9f',
1381 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001382 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001383 (b'\xc4\xb0',
1384 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001385 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001386 (b'\xc5\x83\xcd\xba',
1387 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001388 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1389 # XXX: skip this as it fails in UCS-2 mode
1390 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1391 # 'telc\xe2\x88\x95kg\xcf\x83'),
1392 (None, None),
1393 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001394 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1395 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001396 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001397 (b'\xe1\xbe\xb7',
1398 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001399 # 3.9 Self-reverting case folding U+01F0 and normalization.
1400 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001401 (b'\xc7\xb0',
1402 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001403 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001404 (b'\xce\x90',
1405 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001406 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001407 (b'\xce\xb0',
1408 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001409 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001410 (b'\xe1\xba\x96',
1411 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001412 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001413 (b'\xe1\xbd\x96',
1414 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001415 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001416 (b' ',
1417 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001418 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001419 (b'\xc2\xa0',
1420 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001421 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001422 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001423 None),
1424 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001425 (b'\xe2\x80\x80',
1426 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001427 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001428 (b'\xe2\x80\x8b',
1429 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001430 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001431 (b'\xe3\x80\x80',
1432 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001433 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001434 (b'\x10\x7f',
1435 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001436 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001437 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001438 None),
1439 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001440 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001441 None),
1442 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001443 (b'\xef\xbb\xbf',
1444 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001445 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001446 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001447 None),
1448 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001449 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001450 None),
1451 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001452 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001453 None),
1454 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001455 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001456 None),
1457 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001458 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001459 None),
1460 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001461 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001462 None),
1463 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001464 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001465 None),
1466 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001467 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001468 None),
1469 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001470 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001471 None),
1472 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001473 (b'\xcd\x81',
1474 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001475 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001476 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001477 None),
1478 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001479 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001480 None),
1481 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001482 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001483 None),
1484 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001485 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001486 None),
1487 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001488 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001489 None),
1490 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001491 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001492 None),
1493 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001494 (b'foo\xef\xb9\xb6bar',
1495 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001496 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001497 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001498 None),
1499 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001500 (b'\xd8\xa71\xd8\xa8',
1501 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001502 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001503 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001504 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001505 # None),
1506 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001507 # 3.44 Larger test (shrinking).
1508 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001509 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1510 b'\xaa\xce\xb0\xe2\x80\x80',
1511 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001512 # 3.45 Larger test (expanding).
1513 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001514 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1515 b'\x80',
1516 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1517 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1518 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001519 ]
1520
1521
1522class NameprepTest(unittest.TestCase):
1523 def test_nameprep(self):
1524 from encodings.idna import nameprep
1525 for pos, (orig, prepped) in enumerate(nameprep_tests):
1526 if orig is None:
1527 # Skipped
1528 continue
1529 # The Unicode strings are given in UTF-8
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001530 orig = str(orig, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001531 if prepped is None:
1532 # Input contains prohibited characters
1533 self.assertRaises(UnicodeError, nameprep, orig)
1534 else:
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001535 prepped = str(prepped, "utf-8", "surrogatepass")
Martin v. Löwis2548c732003-04-18 10:39:54 +00001536 try:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001537 self.assertEqual(nameprep(orig), prepped)
Guido van Rossumb940e112007-01-10 16:19:56 +00001538 except Exception as e:
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001539 raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001540
Victor Stinnerf96418d2015-09-21 23:06:27 +02001541
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through its one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, a further read() returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Same four inputs as test_builtin_decode, fed one byte at a time.
        # (The no-dot punycode case replaces a duplicated trailing-dot case.)
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            with self.subTest(encoded=encoded):
                self.assertEqual(
                    "".join(codecs.iterdecode((bytes([c]) for c in encoded),
                                              "idna")),
                    expected
                )

        # A label is only emitted once the dot that ends it has been seen;
        # the trailing label is emitted by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four inputs as test_builtin_encode, fed one character at a
        # time.  (The no-dot case replaces a duplicated trailing-dot case.)
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(
                    b"".join(codecs.iterencode(text, "idna")),
                    expected
                )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1627
Victor Stinnerf96418d2015-09-21 23:06:27 +02001628
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001629class CodecsModuleTest(unittest.TestCase):
1630
1631 def test_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001632 self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1633 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001634 self.assertRaises(TypeError, codecs.decode)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001635 self.assertEqual(codecs.decode(b'abc'), 'abc')
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001636 self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001637
Victor Stinnera57dfd02014-05-14 17:13:14 +02001638 # test keywords
1639 self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1640 '\xe4\xf6\xfc')
1641 self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1642 '[]')
1643
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001644 def test_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001645 self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1646 b'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001647 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001648 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001649 self.assertEqual(codecs.encode('abc'), b'abc')
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001650 self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001651
Victor Stinnera57dfd02014-05-14 17:13:14 +02001652 # test keywords
1653 self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1654 b'\xe4\xf6\xfc')
1655 self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1656 b'[]')
1657
Walter Dörwald063e1e82004-10-28 13:04:26 +00001658 def test_register(self):
1659 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001660 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001661
1662 def test_lookup(self):
1663 self.assertRaises(TypeError, codecs.lookup)
1664 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001665 self.assertRaises(LookupError, codecs.lookup, " ")
1666
1667 def test_getencoder(self):
1668 self.assertRaises(TypeError, codecs.getencoder)
1669 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1670
1671 def test_getdecoder(self):
1672 self.assertRaises(TypeError, codecs.getdecoder)
1673 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1674
1675 def test_getreader(self):
1676 self.assertRaises(TypeError, codecs.getreader)
1677 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1678
1679 def test_getwriter(self):
1680 self.assertRaises(TypeError, codecs.getwriter)
1681 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001682
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001683 def test_lookup_issue1813(self):
1684 # Issue #1813: under Turkish locales, lookup of some codecs failed
1685 # because 'I' is lowercased as "ı" (dotless i)
Antoine Pitroud05066d2011-07-26 23:55:33 +02001686 oldlocale = locale.setlocale(locale.LC_CTYPE)
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02001687 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1688 try:
1689 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1690 except locale.Error:
1691 # Unsupported locale on this system
1692 self.skipTest('test needs Turkish locale')
1693 c = codecs.lookup('ASCII')
1694 self.assertEqual(c.name, 'ascii')
1695
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +02001696 def test_all(self):
1697 api = (
1698 "encode", "decode",
1699 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1700 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1701 "getencoder", "getdecoder", "getincrementalencoder",
1702 "getincrementaldecoder", "getreader", "getwriter",
1703 "register_error", "lookup_error",
1704 "strict_errors", "replace_errors", "ignore_errors",
1705 "xmlcharrefreplace_errors", "backslashreplace_errors",
1706 "namereplace_errors",
1707 "open", "EncodedFile",
1708 "iterencode", "iterdecode",
1709 "BOM", "BOM_BE", "BOM_LE",
1710 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1711 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1712 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1713 "StreamReaderWriter", "StreamRecoder",
1714 )
1715 self.assertCountEqual(api, codecs.__all__)
1716 for api in codecs.__all__:
1717 getattr(codecs, api)
1718
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001719 def test_open(self):
1720 self.addCleanup(support.unlink, support.TESTFN)
1721 for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1722 with self.subTest(mode), \
1723 codecs.open(support.TESTFN, mode, 'ascii') as file:
1724 self.assertIsInstance(file, codecs.StreamReaderWriter)
1725
1726 def test_undefined(self):
1727 self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1728 self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1729 self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1730 self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1731 for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1732 self.assertRaises(UnicodeError,
1733 codecs.encode, 'abc', 'undefined', errors)
1734 self.assertRaises(UnicodeError,
1735 codecs.decode, b'abc', 'undefined', errors)
1736
Victor Stinnerf96418d2015-09-21 23:06:27 +02001737
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 for U+D55C, a newline, then U+AE00.
        encoded = b'\xed\x95\x9c\n\xea\xb8\x80'
        self.stream = io.BytesIO(encoded)
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() keeps the line terminator on the first line.
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001747
Victor Stinnerf96418d2015-09-21 23:06:27 +02001748
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile recodes between a data and a file encoding."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the underlying file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        recoded = recoder.read()
        self.assertEqual(recoded, b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input ends up as Latin-1 in the underlying file.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        stored = sink.getvalue()
        self.assertEqual(stored, b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001760
# Names of the codecs that BasicUnicodeTest loops over.  test_basics
# round-trips the string "abc123" through each of them, so every codec
# listed here has to be able to encode those characters.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1868
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode, so the stream
# reader/writer and incremental encoder/decoder checks skip them.
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal"
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001878
Victor Stinnerf96418d2015-09-21 23:06:27 +02001879
Walter Dörwald3abcb012007-04-16 22:10:50 +00001880class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001881 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001882 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001883 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001884 name = codecs.lookup(encoding).name
1885 if encoding.endswith("_codec"):
1886 name += "_codec"
1887 elif encoding == "latin_1":
1888 name = "latin_1"
1889 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001890
Ezio Melottiadc417c2011-11-17 12:23:34 +02001891 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001892 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001893 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001894 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001895 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001896 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001897
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001898 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001899 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001900 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001901 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001902 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001903 for c in s:
1904 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001905 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001906 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001907 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001908 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001909 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001910 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001911 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001912 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001913 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001914 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001915
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001916 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001917 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001918 try:
1919 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001920 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001921 pass
1922 else:
1923 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001924 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001925 for c in s:
1926 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001927 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001928 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001929 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001930 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001931 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001932 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001933 self.assertEqual(decodedresult, s,
1934 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001935
1936 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001937 result = "".join(codecs.iterdecode(
1938 codecs.iterencode(s, encoding), encoding))
1939 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001940
1941 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001942 result = "".join(codecs.iterdecode(
1943 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001944 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001945
Victor Stinner554f3f02010-06-16 23:33:54 +00001946 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001947 # check incremental decoder/encoder with errors argument
1948 try:
1949 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001950 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001951 pass
1952 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001953 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001954 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001955 decodedresult = "".join(decoder.decode(bytes([c]))
1956 for c in encodedresult)
1957 self.assertEqual(decodedresult, s,
1958 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001959
@support.cpython_only
def test_basics_capi(self):
    # Round-trips a short ASCII string through the incremental encoder and
    # decoder objects returned by the _testcapi helpers, feeding them one
    # character / one byte at a time, for every name in
    # all_unicode_encodings that is not in broken_unicode_with_stateful.
    #
    # NOTE(review): the listing this method was taken from had lost its
    # indentation; the "errors argument" section is assumed to sit inside
    # the broken_unicode_with_stateful guard -- confirm against the
    # sibling test_basics.
    from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
    s = "abc123" # all codecs should be able to encode these
    for encoding in all_unicode_encodings:
        if encoding not in broken_unicode_with_stateful:
            # check incremental decoder/encoder (fetched via the C API)
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError: # no IncrementalEncoder
                pass
            else:
                # check C API
                encodedresult = b""
                for c in s:
                    encodedresult += cencoder.encode(c)
                # An empty chunk with the second argument True ends the
                # input, so any pending encoder output is collected too.
                encodedresult += cencoder.encode("", True)
                cdecoder = codec_incrementaldecoder(encoding)
                decodedresult = ""
                for c in encodedresult:
                    # Iterating bytes yields ints; wrap each one back
                    # into a length-1 bytes object for the decoder.
                    decodedresult += cdecoder.decode(bytes([c]))
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 "encoding=%r" % encoding)

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(cencoder.encode(c) for c in s)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(cdecoder.decode(bytes([c]))
                                            for c in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001998
def test_seek(self):
    # Calling seek() on a stream reader must reset the internal codec
    # state and buffers: every full read after a rewind has to return the
    # complete text again.
    # All codecs should be able to encode these characters.
    text = "%s\n%s\n" % (100*"abc123", 100*"def456")
    for encoding in all_unicode_encodings:
        # "idna" is skipped -- FIXME: See SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_stateful:
            continue
        make_reader = codecs.getreader(encoding)
        reader = make_reader(io.BytesIO(text.encode(encoding)))
        for _ in range(5):
            reader.seek(0, 0)
            self.assertEqual(text, reader.read())
Walter Dörwald729c31f2005-03-14 19:06:30 +00002013
def test_bad_decode_args(self):
    # Every decoder must raise TypeError when called without arguments.
    # The int-argument check is not applied to "idna" and "punycode".
    int_check_exempt = ("idna", "punycode")
    for name in all_unicode_encodings:
        decode = codecs.getdecoder(name)
        self.assertRaises(TypeError, decode)
        if name in int_check_exempt:
            continue
        self.assertRaises(TypeError, decode, 42)
2020
def test_bad_encode_args(self):
    # Every encoder must raise TypeError when called without arguments.
    for name in all_unicode_encodings:
        encode = codecs.getencoder(name)
        # unicode-internal has been deprecated, hence the warnings guard
        with support.check_warnings():
            with self.assertRaises(TypeError):
                encode()
Walter Dörwalde22d3392005-11-17 08:52:34 +00002027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002028 def test_encoding_map_type_initialized(self):
2029 from encodings import cp1140
2030 # This used to crash, we are only verifying there's no crash.
2031 table_type = type(cp1140.encoding_table)
2032 self.assertEqual(table_type, table_type)
2033
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly,
    # for both the decoding and the encoding direction.
    text = "abc123"
    for name in all_unicode_encodings:
        if name in broken_unicode_with_stateful:
            continue
        encoded = text.encode(name)
        self.check_state_handling_decode(name, text, encoded)
        self.check_state_handling_encode(name, text, encoded)
2041
Victor Stinnerf96418d2015-09-21 23:06:27 +02002042
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() is exercised with three kinds of decoding
    # table: a str indexed by byte value, a dict of int -> str and a dict
    # of int -> code point.  Each test first decodes fully mapped input,
    # then walks the "strict", "replace", "backslashreplace" and "ignore"
    # error handlers for tables that do not provide a usable entry for
    # byte 2.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # With a complete table the result is the table text itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Byte 2 is past the end of the table, or mapped to U+FFFE.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)

        outcomes = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in outcomes:
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, text in complete:
            self.assertEqual(decode(data, "strict", table), (text, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (Issue #14850).
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)

        outcomes = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in outcomes:
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = map(ord, "abc")

        complete = (
            ({0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for table, text in complete:
            self.assertEqual(decode(data, "strict", table), (text, 3))

        # A value above sys.maxunicode is reported as a TypeError rather
        # than as a decoding error.
        self.assertRaises(TypeError,
                          decode, data, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing, or mapped to U+FFFE.
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)

        outcomes = (
            ("replace", "ab\ufffd"),
            ("backslashreplace", "ab\\x02"),
            ("ignore", "ab"),
        )
        for errors, text in outcomes:
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))
2277
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002278
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable as context managers and
    # must close the wrapped stream when the with block is left.

    def test_encodedfile(self):
        # The stream holds UTF-8 for U+00FC; EncodedFile recodes it so
        # that reads return the Latin-1 byte.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same check as for EncodedFile: leaving the block must close
        # the underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002292
Victor Stinnerf96418d2015-09-21 23:06:27 +02002293
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is out of range: an error under "strict", U+FFFD
        # under "replace", and the escaped source bytes under
        # "backslashreplace".
        escaped_source = r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30"
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                             (escaped_source, 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002333
Serhiy Storchakad6793772013-01-29 10:20:44 +02002334
class UnicodeEscapeTest(unittest.TestCase):
    # Covers codecs.unicode_escape_encode() / unicode_escape_decode():
    # pass-through characters, escape sequences and truncated escapes.

    def test_empty(self):
        encoded = codecs.unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character
        # with the same ordinal.
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named:
            check(char, escaped)
        # Remaining control characters and 127..255 use the \xhh form;
        # the membership test only ever excludes tab, LF and CR.
        for code in list(range(32)) + list(range(127, 256)):
            if chr(code) not in '\t\n\r':
                check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for raw, text in cases:
            check(raw, text)
        # A backslash followed by any other byte is kept as two characters.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape introducer followed by too few hex digits.
                truncated = b"\\" + c + b"0"*i
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2411
2412
class RawUnicodeEscapeTest(unittest.TestCase):
    # Covers codecs.raw_unicode_escape_encode() / raw_unicode_escape_decode():
    # only the \u and \U forms are treated as escapes here.

    def test_empty(self):
        encoded = codecs.raw_unicode_escape_encode("")
        self.assertEqual(encoded, (b"", 0))
        decoded = codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(decoded, ("", 0))

    def test_raw_encode(self):
        # Every character below 256 encodes to the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but "u"/"U" passes through.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape introducer followed by too few hex digits.
                truncated = b"\\" + c + b"0"*i
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        out_of_range = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, out_of_range)
        self.assertEqual(decode(out_of_range, "ignore"), ("", 10))
        self.assertEqual(decode(out_of_range, "replace"), ("\ufffd", 10))
2461
2462
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" handler an undecodable byte 0xNN becomes
    # the lone surrogate U+DCNN, and encoding turns it back into 0xNN.

    def _check_roundtrip(self, encoding, raw, text):
        # Decode first, then encode, each against the expected counterpart.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80",
                              "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2495
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002496
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Writes through codecs.open() in UTF-16/UTF-32 variants and
        # checks that the text reads back exactly as written, whatever
        # seeks happened between the writes.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Fresh read/write handle on the scratch file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        # The seek scenarios run once through the file object itself and
        # once through its StreamWriter (the .writer attribute).
        writers = (lambda f: f, lambda f: f.writer)

        for encoding in tests:
            # Check if the BOM is written only once
            with reopen(encoding) as f:
                for _ in range(2):
                    f.write(data)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            for writer_of in writers:
                with reopen(encoding) as f:
                    w = writer_of(f)
                    w.write(data[0])
                    self.assertNotEqual(w.tell(), 0)
                    w.seek(0)
                    w.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a
            # position different than the start
            for writer_of in writers:
                with reopen(encoding) as f:
                    w = writer_of(f)
                    w.write(data)
                    w.seek(w.tell())
                    w.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002552
Victor Stinner3fed0872010-05-22 02:16:27 +00002553
# Codecs that map bytes to bytes.  The compression codecs are appended
# further down, only when their backing module can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# Alias names under which each transform codec can also be looked up.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Leave a falsy placeholder so later code can test "zlib" directly.
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002583
Victor Stinnerf96418d2015-09-21 23:06:27 +02002584
Georg Brandl02524622010-12-02 18:06:51 +00002585class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002586
def test_basics(self):
    # Every byte value must survive an encode/decode round trip through
    # the generic codecs interface, with the consumed sizes reported
    # correctly.
    payload = bytes(range(256))
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(payload)
            self.assertEqual(consumed, len(payload))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)
Georg Brandl02524622010-12-02 18:06:51 +00002597
def test_read(self):
    # A stream reader over encoded data must give back the raw byte.
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.read(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002605
def test_readline(self):
    # Same as test_read, but going through readline().
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            self.assertEqual(reader.readline(), b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002613
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded = codecs.encode(original, encoding)
            self.assertEqual(
                codecs.encode(memoryview(original), encoding), encoded)
            decoded = codecs.decode(encoded, encoding)
            self.assertEqual(decoded, original)
            self.assertEqual(
                codecs.decode(memoryview(encoded), encoding), decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002631
def test_text_to_binary_blacklists_binary_transforms(self):
    # Check binary -> binary codecs give a good error for str input
    bad_input = "bad input type"
    # Raw literals throughout: "\(" is a regex escape for the pattern,
    # not a string escape, and is invalid in a non-raw literal.
    fmt = (r"{!r} is not a text encoding; "
           r"use codecs.encode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg) as failure:
                bad_input.encode(encoding)
            # The LookupError must be raised directly, not chained from
            # another exception.
            self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002643
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002644 def test_text_to_binary_blacklists_text_transforms(self):
2645 # Check str.encode gives a good error message for str -> str codecs
2646 msg = (r"^'rot_13' is not a text encoding; "
2647 "use codecs.encode\(\) to handle arbitrary codecs")
2648 with self.assertRaisesRegex(LookupError, msg):
2649 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002650
def test_binary_to_text_blacklists_binary_transforms(self):
    # Check bytes.decode and bytearray.decode give a good error
    # message for binary -> binary codecs
    data = b"encode first to ensure we meet any format restrictions"
    # Raw literals throughout: "\(" is a regex escape for the pattern,
    # not a string escape, and is invalid in a non-raw literal.
    fmt = (r"{!r} is not a text encoding; "
           r"use codecs.decode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded_data = codecs.encode(data, encoding)
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                encoded_data.decode(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                bytearray(encoded_data).decode(encoding)
2665
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002666 def test_binary_to_text_blacklists_text_transforms(self):
2667 # Check str -> str codec gives a good error for binary input
2668 for bad_input in (b"immutable", bytearray(b"mutable")):
2669 with self.subTest(bad_input=bad_input):
2670 msg = (r"^'rot_13' is not a text encoding; "
2671 "use codecs.decode\(\) to handle arbitrary codecs")
2672 with self.assertRaisesRegex(LookupError, msg) as failure:
2673 bad_input.decode("rot_13")
2674 self.assertIsNone(failure.exception.__cause__)
2675
Zachary Wareefa2e042013-12-30 14:54:11 -06002676 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002677 def test_custom_zlib_error_is_wrapped(self):
2678 # Check zlib codec gives a good error for malformed input
2679 msg = "^decoding with 'zlib_codec' codec failed"
2680 with self.assertRaisesRegex(Exception, msg) as failure:
2681 codecs.decode(b"hello", "zlib_codec")
2682 self.assertIsInstance(failure.exception.__cause__,
2683 type(failure.exception))
2684
2685 def test_custom_hex_error_is_wrapped(self):
2686 # Check hex codec gives a good error for malformed input
2687 msg = "^decoding with 'hex_codec' codec failed"
2688 with self.assertRaisesRegex(Exception, msg) as failure:
2689 codecs.decode(b"hello", "hex_codec")
2690 self.assertIsInstance(failure.exception.__cause__,
2691 type(failure.exception))
2692
2693 # Unfortunately, the bz2 module throws OSError, which the codec
2694 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002695
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002696 # Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias has to resolve to the codec its canonical name gives
    for canonical, alias_names in transform_aliases.items():
        canonical_name = codecs.lookup(canonical).name
        for alias in alias_names:
            with self.subTest(alias=alias):
                resolved = codecs.lookup(alias)
                self.assertEqual(resolved.name, canonical_name)
2704
Martin Panter06171bd2015-09-12 00:34:28 +00002705 def test_quopri_stateless(self):
2706 # Should encode with quotetabs=True
2707 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2708 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2709 # But should still support unescaped tabs and spaces
2710 unescaped = b"space tab eol\n"
2711 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2712
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002713 def test_uu_invalid(self):
2714 # Missing "begin" line
2715 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2716
Nick Coghlan8b097b42013-11-13 23:49:21 +10002717
2718# The codec system tries to wrap exceptions in order to ensure the error
2719# mentions the operation being performed and the codec involved. We
2720# currently *only* want this to happen for relatively stateless
2721# exceptions, where the only significant information they contain is their
2722# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002723
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Codec name -> CodecInfo for the codecs installed by the tests below
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    # Search function handed to codecs.register(): returns the CodecInfo
    # stored for codec_name, or None when no test codec has that name
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)
2731
try:
    # Issue #22166: CPython also keeps an internal codec cache that has
    # to be cleared
    from _codecs import _forget_codec
except ImportError:
    # No such helper available: fall back to a do-nothing stand-in
    def _forget_codec(codec_name):
        return None
2738
2739
class ExceptionChainingTest(unittest.TestCase):
    # Each test installs a throwaway codec in the module-level
    # _TEST_CODECS registry and checks whether an exception raised by that
    # codec reaches the caller wrapped (operation and codec name in the
    # message, original exception as __cause__) or unmodified.

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            # The codec was never looked up, so there is nothing cached
            pass

    def set_codec(self, encode, decode):
        # Publish a codec built from the given encode/decode callables
        # under this test's unique codec name
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Context manager: the body must raise exc_type with the
        # "<operation> with '<codec>' codec failed (<type>: <msg>)"
        # message, chained from an exc_type instance that still has its
        # traceback.  msg is inserted into the regex as-is.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Install a codec that raises obj_to_raise and check that all four
        # entry points (str.encode, codecs.encode, bytes.decode,
        # codecs.decode) report it wrapped as exc_type
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Install a codec that raises obj_to_raise and check that all four
        # entry points let a RuntimeError matching msg through
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # (every pattern fragment is a raw string: "\(" is meant as a
        # regex escape and is an invalid escape in a plain literal)
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2899
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002900
Georg Brandl02524622010-12-02 18:06:51 +00002901
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Exercises codecs.code_page_encode() / codecs.code_page_decode()
    # against several code pages.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # Code page -1 must raise ValueError, code page 123 OSError
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message has to mention the code page that was used
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # tests holds (raw bytes, error handler, expected text) triples;
        # an expected value of None means decoding has to fail
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                outcome = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(outcome[0], expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, outcome[0], expected))
            # assert 0 <= outcome[1] <= len(raw)
            self.assertGreaterEqual(outcome[1], 0)
            self.assertLessEqual(outcome[1], len(raw))

    def check_encode(self, cp, tests):
        # tests holds (text, error handler, expected bytes) triples;
        # an expected value of None means encoding has to fail
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                outcome = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(outcome[0], expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, outcome[0], expected))
            self.assertEqual(outcome[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        if not VISTA_OR_LATER:
            return
        # These encode cases are only run on Windows Vista or later
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # With the last argument False, a trailing incomplete sequence is
        # expected to be left unconsumed instead of raising
        lone_lead = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(lone_lead, ('', 0))

        one_and_a_half = codecs.code_page_decode(
            932, b'\xe9\x80\xe9', 'strict', False)
        self.assertEqual(one_and_a_half, ('\u9a3e', 2))

        two_complete = codecs.code_page_decode(
            932, b'\xe9\x80\xe9\x80', 'strict', False)
        self.assertEqual(two_complete, ('\u9a3e\u9a3e', 4))

        ascii_only = codecs.code_page_decode(
            932, b'abc', 'strict', False)
        self.assertEqual(ascii_only, ('abc', 3))
3060
3061
class ASCIITest(unittest.TestCase):
    # Exercises the 'ascii' codec, with and without error handlers.

    def test_encode(self):
        encoded = 'abc123'.encode('ascii')
        self.assertEqual(encoded, b'abc123')

    def test_encode_error(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('ascii', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\xff'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('ascii', 'surrogateescape')

    def test_decode(self):
        decoded = b'abc'.decode('ascii')
        self.assertEqual(decoded, 'abc')

    def test_decode_error(self):
        # (raw bytes, error handler, expected text)
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                decoded = data.decode('ascii', error_handler)
                self.assertEqual(decoded, expected)
3098
3099
class Latin1Test(unittest.TestCase):
    # Exercises the 'latin1' codec, with and without error handlers.

    def test_encode(self):
        # (text, expected bytes)
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                encoded = data.encode('latin1')
                self.assertEqual(encoded, expected)

    def test_encode_errors(self):
        # (text, error handler, expected bytes)
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for data, error_handler, expected in cases:
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                encoded = data.encode('latin1', error_handler)
                self.assertEqual(encoded, expected)

    def test_encode_surrogateescape_error(self):
        # the first character can be encoded, but not the second
        text = '\udc80\u20ac'
        with self.assertRaises(UnicodeEncodeError):
            text.encode('latin1', 'surrogateescape')

    def test_decode(self):
        # (raw bytes, expected text)
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for data, expected in cases:
            with self.subTest(data=data, expected=expected):
                decoded = data.decode('latin1')
                self.assertEqual(decoded, expected)
3134
3135
# Run the test suite when this file is executed as a script
if __name__ == "__main__":
    unittest.main()