# blob: 04795427454b05edb22c9197956bc5d650366d03 [file] [log] [blame]
import codecs
import contextlib
import encodings
import io
import locale
import sys
import unittest
import warnings

from test import support

# True only on Windows builds whose reported major version is 6 or later;
# always False on every other platform.
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)

# ctypes is optional: when it cannot be imported, ``ctypes`` is None and
# SIZEOF_WCHAR_T is -1 so that dependent code can detect its absence.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and callable *coder*.

    The returned ``check(input, expect)`` asserts, through
    ``self.assertEqual``, that ``coder(input)`` equals
    ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. bytes) is kept by
        # every later operation because only ``+=`` and slicing are used.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            size = len(self._buffer)
        data = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return data

class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    The methods call ``self.assert*``, so the class has to be combined with
    one that provides the unittest assertion methods.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* may be interrupted at any offset.

        For each split point the first part of *s* is decoded, the decoder
        state is transferred to a fresh decoder and the remainder is decoded
        there; the two results concatenated must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* may be interrupted at any offset.

        Mirror image of check_state_handling_decode(): the encoder state
        taken after the first part is installed in a fresh encoder, which
        encodes the remainder; the concatenation must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Reading/decoding tests shared by the per-encoding test cases.

    The methods read ``self.encoding`` and ``self.ill_formed_sequence`` and
    call the unittest assertion methods, so concrete subclasses have to
    supply all of them.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and compare partial output.

        *partialresults* lists the text expected to have been decoded after
        each successive byte of ``input.encode(self.encoding)``.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the lines of a larger
        # "\r\n"-terminated text unchanged.
        # NOTE(review): the literals are reproduced exactly as they appear
        # in this copy of the file -- confirm the whitespace inside them
        # against the canonical source.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one Queue, so the reader only
        # sees what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of self.ill_formed_sequence when decoding with
    # the "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string is treated as its
        # BOM and stripped from the separately encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec."""

    encoding = "utf-32"
    # Byte sequence used by ReadTest.test_lone_surrogates; its byte order
    # follows sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # The two byte strings accepted for "spam" written twice: a single
    # leading BOM followed by the eight characters, in either byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # encode some stream
        s = io.BytesIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
                "", "", "",
                "\x00", "\x00", "\x00", "\x00",
                "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Byte sequence used by ReadTest.test_lone_surrogates.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One row per input character: four bytes each, and a character
        # only shows up in the output once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", "", "",
                "\x00", "\x00", "\x00", "\x00",
                "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Byte sequence used by ReadTest.test_lone_surrogates.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One row per input character: four bytes each, and a character
        # only shows up in the output once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", "", "",
                "\x00", "\x00", "\x00", "\x00",
                "\x00\xff", "\x00\xff", "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec."""

    encoding = "utf-16"
    # Byte sequence used by ReadTest.test_lone_surrogates; its byte order
    # follows sys.byteorder.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # The two byte strings accepted for "spam" written twice: a single
    # leading BOM followed by the eight characters, in either byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # encode some stream
        s = io.BytesIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00", "\x00",
                "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning;
        # only the open() call itself is covered by the warning check.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Byte sequence used by ReadTest.test_lone_surrogates.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # Two bytes per BMP character, four for the final non-BMP one; a
        # character only shows up once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00", "\x00",
                "\x00\xff", "\x00\xff",
                "\x00\xff\u0100", "\x00\xff\u0100",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff", "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with the text expected from the
        # "replace" error handler; strict decoding of the same input with
        # final=True must raise.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec, on top of the shared ReadTest checks."""

    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected entry per encoded byte of the input (12 bytes).
        # NOTE(review): presumably check_partial() feeds the encoded input
        # one byte at a time and compares the text decoded so far -- confirm
        # against ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (raw bytes, expected result of decoding with errors='replace').
        # Every raw value must also be rejected by a final strict decode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports the failing vector and keeps checking the
            # remaining ones, as UTF7Test.test_errors already does.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
745
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec, on top of the shared ReadTest checks."""

    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # One expected entry per encoded byte of the input (15 bytes).
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_decoder_state(self):
        # Sample text mixing 1-, 2-, 3- and 4-byte UTF-8 sequences.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # With "surrogatepass", lone surrogates round-trip through their
        # three-byte encodings.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000799
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec.

    Several expectations depend on VISTA_OR_LATER, which is computed at
    module level from the running Windows version.
    """

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); an expected value of None
        # means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest identifies the failing vector and lets the remaining
            # vectors run, as the UTF-7 table tests in this file do.
            with self.subTest(text=text, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeEncodeError,
                                      text.encode, "cp65001", errors)
                    continue
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                                 '%a.encode("cp65001", %r)=%a != %a'
                                 % (text, errors, encoded, expected))

    def test_decode(self):
        # (raw bytes, error handler, expected text); an expected value of
        # None means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            with self.subTest(raw=raw, errors=errors):
                if expected is None:
                    self.assertRaises(UnicodeDecodeError,
                                      raw.decode, 'cp65001', errors)
                    continue
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                                 '%a.decode("cp65001", %r)=%a != %a'
                                 % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # ...and are handled by each error handler when encoding.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # With "surrogatepass", lone surrogates round-trip through their
        # three-byte encodings.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
901
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200902
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec, on top of the shared ReadTest checks."""

    encoding = "utf-7"

    def test_ascii(self):
        codec = self.encoding

        def check_direct(chars):
            # Characters that UTF-7 passes through as plain ASCII, both ways.
            self.assertEqual(chars.encode(codec), chars.encode('ascii'))
            self.assertEqual(chars.encode('ascii').decode(codec), chars)

        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        check_direct(set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        check_direct(set_o)
        # +
        self.assertEqual('a+b'.encode(codec), b'a+-b')
        self.assertEqual(b'a+-b'.decode(codec), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        check_direct(ws)
        # Other ASCII characters
        all_ascii = set(bytes(range(0x80)).decode())
        already_checked = set(set_d + set_o + '+' + ws)
        other_ascii = ''.join(sorted(all_ascii - already_checked))
        self.assertEqual(other_ascii.encode(codec),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected entry per encoded byte of the input (32 entries).
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        expected_steps = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # (raw bytes, expected result of decoding with errors='replace').
        # Every raw value must also be rejected by a final strict decode.
        cases = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        codec = self.encoding
        smp = '\U000104A0'
        # The astral character and its explicit surrogate pair encode alike.
        self.assertEqual(smp.encode(codec), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(codec), b'+2AHcoA-')
        # Decoding works with and without the terminating '-'.
        for raw in (b'+2AHcoA-', b'+2AHcoA'):
            self.assertEqual(raw.decode(codec), smp)
        for text, terminated in (
            ('\u20ac\U000104A0', b'+IKzYAdyg-'),
            ('\u20ac\u20ac\U000104A0', b'+IKwgrNgB3KA-'),
        ):
            self.assertEqual(text.encode(codec), terminated)
            self.assertEqual(terminated.decode(codec), text)
            # Same bytes with the trailing '-' dropped.
            self.assertEqual(terminated[:-1].decode(codec), text)

    def test_lone_surrogates(self):
        # (raw bytes, expected result of decoding with errors='replace').
        cases = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                decoded = raw.decode('utf-7', 'replace')
                self.assertEqual(decoded, expected)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001034
1035
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte decoded as final input in strict mode is an error.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1043
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes yields its contents and their length.
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        # An empty str gives empty bytes and a length of zero.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Missing argument first, then an int; both are TypeErrors.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
1059
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec; the UTF8Test tests are inherited."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected entry per encoded byte of the input (21 entries).
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder returns the text without the BOM that
        # the "utf-8-sig" encoder produced.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_utf8_sig_stream(self, bytestring, unistring):
        # Decode bytestring through a "utf-8-sig" StreamReader using a range
        # of read sizes (None means a plain read()) and check that the
        # concatenated chunks equal unistring.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest names the failing read size and lets the others run.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starting with a BOM: the BOM must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_utf8_sig_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_utf8_sig_stream(bytestring, unistring)
1143
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # Empty bytes and an empty bytearray both decode to (b"", 0).
        for empty in (b"", bytearray()):
            self.assertEqual(codecs.escape_decode(empty), (b"", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        # Every byte except the backslash passes through unchanged.
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for data, expected in cases:
            check(data, expected)
        # A backslash before any other byte is left as it is.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for value in range(256):
            if value in recognised:
                continue
            unknown = b'\\' + bytes([value])
            check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # Incomplete \x escapes: no hex digit, then a single hex digit.
        for bad in (br"\x", br"\x0"):
            wrapped = b"[" + bad + b"]"
            for data in (bad, wrapped):
                self.assertRaises(ValueError, decode, data)
            doubled = wrapped + bad
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001196
class RecodingTest(unittest.TestCase):
    """Regression test for closing a codecs.EncodedFile wrapper."""

    def test_recoding(self):
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

        # Closing the wrapper must also have closed the wrapped stream.
        self.assertTrue(raw.closed)
1207
# Punycode sample strings, from RFC 3492.
# Each entry is a (text, expected punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (
        "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
        "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
        b"egbpdaj6bu4bxfgehfvwxn",
    ),
    # (B) Chinese (simplified):
    (
        "\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
        b"ihqwcrb4cv8a8dqg056pqjye",
    ),
    # (C) Chinese (traditional):
    (
        "\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
        b"ihqwctvzc91f659drss3x8bo0yb",
    ),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (
        "\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
        "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
        "\u0065\u0073\u006B\u0079",
        b"Proprostnemluvesky-uyb24dma41a",
    ),
    # (E) Hebrew:
    (
        "\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
        "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
        "\u05D1\u05E8\u05D9\u05EA",
        b"4dbcagdahymbxekheh6e0a7fei0b",
    ),
    # (F) Hindi (Devanagari):
    (
        "\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
        "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
        "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
        "\u0939\u0948\u0902",
        b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd",
    ),
    # (G) Japanese (kanji and hiragana):
    (
        "\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
        "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
        b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa",
    ),
    # (H) Korean (Hangul syllables):
    (
        "\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
        "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
        "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
        b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
        b"psd879ccm6fea98c",
    ),
    # (I) Russian (Cyrillic):
    (
        "\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
        "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
        "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
        "\u0438",
        b"b1abfaaepdrnnbgefbaDotcwatmq2g4l",
    ),
    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (
        "\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
        "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
        "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
        "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
        "\u0061\u00F1\u006F\u006C",
        b"PorqunopuedensimplementehablarenEspaol-fmd56a",
    ),
    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (
        "\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
        "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
        "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
        "\u0056\u0069\u1EC7\u0074",
        b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g",
    ),
    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (
        "\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
        b"3B-ww4c5e180e575a65lsy2b",
    ),
    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (
        "\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
        "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
        "\u004F\u004E\u004B\u0045\u0059\u0053",
        b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n",
    ),
    # (N) Hello-Another-Way-<sorezore><no><basho>
    (
        "\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
        "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
        "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
        b"Hello-Another-Way--fc4qua05auwb3674vfr0b",
    ),
    # (O) <hitotsu><yane><no><shita>2
    (
        "\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
        b"2-u9tlzr9756bt3uc0v",
    ),
    # (P) Maji<de>Koi<suru>5<byou><mae>
    (
        "\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
        "\u308B\u0035\u79D2\u524D",
        b"MajiKoi5-783gue6qz075azm5e",
    ),
    # (Q) <pafii>de<runba>
    (
        "\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
        b"de-jg4avhby1noc0d",
    ),
    # (R) <sono><supiido><de>
    (
        "\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
        b"d9juau41awczczp",
    ),
    # (S) -> $1.00 <-
    (
        "\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
        "\u003C\u002D",
        b"-> $1.00 <--",
    ),
]

# Sanity check of the table: print any entry that is not a pair.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315
class PunycodeTest(unittest.TestCase):
    """Runs the punycode codec over the module-level punycode_testcases."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii").lower()
            wanted = str(puny, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode a second time from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001335class UnicodeInternalTest(unittest.TestCase):
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_bug1251300(self):
    # Decoding with unicode_internal used to not correctly handle "code
    # points" above 0x10ffff on UCS-4 builds.
    little_endian = sys.byteorder == "little"

    # Byte strings are written big-endian here and reversed on
    # little-endian hosts before decoding.
    valid = [
        (b"\x00\x10\xff\xff", "\U0010ffff"),
        (b"\x00\x00\x01\x01", "\U00000101"),
        (b"", ""),
    ]
    rejected = [
        b"\x7f\xff\xff\xff",
        b"\x80\x00\x00\x00",
        b"\x81\x00\x00\x00",
        b"\x00",
        b"\x00\x00\x00\x00\x00",
    ]
    for raw, expected in valid:
        if little_endian:
            raw = raw[::-1]
        with support.check_warnings():
            self.assertEqual(expected, raw.decode("unicode_internal"))
    for raw in rejected:
        if little_endian:
            raw = raw[::-1]
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            with self.assertRaises(UnicodeDecodeError):
                raw.decode("unicode_internal")

    # 0x110000, one past the last valid code point, in host byte order.
    if little_endian:
        invalid = b"\x00\x00\x11\x00"
        invalid_backslashreplace = r"\x00\x00\x11\x00"
    else:
        invalid = b"\x00\x11\x00\x00"
        invalid_backslashreplace = r"\x00\x11\x00\x00"
    with support.check_warnings():
        with self.assertRaises(UnicodeDecodeError):
            invalid.decode("unicode_internal")
    with support.check_warnings():
        replaced = invalid.decode("unicode_internal", "replace")
        self.assertEqual(replaced, '\ufffd')
    with support.check_warnings():
        escaped = invalid.decode("unicode_internal", "backslashreplace")
        self.assertEqual(escaped, invalid_backslashreplace)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001379
Victor Stinner182d90d2011-09-29 19:53:55 +02001380 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001381 def test_decode_error_attributes(self):
Victor Stinner182d90d2011-09-29 19:53:55 +02001382 try:
Ezio Melotti345379a2011-11-16 09:54:19 +02001383 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001384 'deprecated', DeprecationWarning)):
1385 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Victor Stinner182d90d2011-09-29 19:53:55 +02001386 except UnicodeDecodeError as ex:
1387 self.assertEqual("unicode_internal", ex.encoding)
1388 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1389 self.assertEqual(4, ex.start)
1390 self.assertEqual(8, ex.end)
1391 else:
1392 self.fail()
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001393
Victor Stinner182d90d2011-09-29 19:53:55 +02001394 @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001395 def test_decode_callback(self):
Victor Stinner182d90d2011-09-29 19:53:55 +02001396 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1397 decoder = codecs.getdecoder("unicode_internal")
Ezio Melotti345379a2011-11-16 09:54:19 +02001398 with support.check_warnings(('unicode_internal codec has been '
Ezio Melotti11060a42011-11-16 09:39:10 +02001399 'deprecated', DeprecationWarning)):
1400 ab = "ab".encode("unicode_internal").decode()
1401 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1402 "ascii"),
1403 "UnicodeInternalTest")
Victor Stinner182d90d2011-09-29 19:53:55 +02001404 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001405
Walter Dörwald8dc33d52009-05-06 14:41:26 +00001406 def test_encode_length(self):
Ezio Melottiadc417c2011-11-17 12:23:34 +02001407 with support.check_warnings(('unicode_internal codec has been '
1408 'deprecated', DeprecationWarning)):
Victor Stinner040e16e2011-11-15 22:44:05 +01001409 # Issue 3739
1410 encoder = codecs.getencoder("unicode_internal")
1411 self.assertEqual(encoder("a")[1], 1)
1412 self.assertEqual(encoder("\xe9\u0142")[1], 2)
1413
1414 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001415
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings.
# An expected value of None marks input that nameprep must reject, and
# (None, None) is a placeholder for a skipped vector so that list positions
# keep matching the "3.N" numbering used below.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1568
1569
class NameprepTest(unittest.TestCase):
    """Run the module-level ``nameprep_tests`` vectors through nameprep()."""

    def test_nameprep(self):
        """Check every vector, reporting each one as its own subtest.

        A vector whose expected value is None must make nameprep() raise
        UnicodeError; any other vector must map to the expected string.
        Vectors are labelled "3.N" after their position in the table, so a
        failure names the offending vector and the remaining vectors are
        still checked.
        """
        from encodings.idna import nameprep
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped placeholder entry; it still consumes its number.
                continue
            with self.subTest(vector="3.%d" % number):
                # The Unicode strings are given in UTF-8
                text = str(orig, "utf-8", "surrogatepass")
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, text)
                else:
                    expected = str(prepped, "utf-8", "surrogatepass")
                    self.assertEqual(nameprep(text), expected)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001588
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs.

    Every group of name checks covers an ASCII-only name and a name that
    needs an ACE ("xn--") label, each with and without a trailing dot.
    """

    def test_builtin_decode(self):
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            self.assertEqual(str(encoded, "idna"), expected)

    def test_builtin_encode(self):
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(text.encode("idna"), expected)

    def test_stream(self):
        # After the three available bytes were requested, a further read()
        # must return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        def bytewise(data):
            # Feed the decoder one byte at a time.
            return (bytes([c]) for c in data)

        # The no-trailing-dot ACE name replaces a vector that used to be
        # listed twice, so all four shapes are covered as in
        # test_builtin_decode.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode(bytewise(encoded), "idna")),
                expected
            )

        # A label is only emitted once its terminating dot (or the final
        # call) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # The no-trailing-dot non-ASCII name replaces a vector that used to
        # be listed twice, so all four shapes are covered as in
        # test_builtin_encode.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # As for decoding, an unfinished label is held back until its dot
        # or the final call arrives.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1674
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of :mod:`codecs`."""

    def test_decode(self):
        raw, text = b'\xe4\xf6\xfc', '\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(raw, 'latin-1'), text)
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        self.assertEqual(codecs.decode(obj=raw, encoding='latin-1'), text)
        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
                         '[]')

    def test_encode(self):
        text, raw = '\xe4\xf6\xfc', b'\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(text, 'latin-1'), raw)
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        self.assertEqual(codecs.encode(obj=text, encoding='latin-1'), raw)
        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
                         b'[]')

    def test_register(self):
        # Both a missing argument and the argument 42 are a TypeError.
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')

    def test_all(self):
        # codecs.__all__ must list exactly these names, and every listed
        # name must resolve to an attribute of the module.
        expected = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "namereplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertCountEqual(expected, codecs.__all__)
        for name in codecs.__all__:
            getattr(codecs, name)

    def test_open(self):
        # codecs.open() hands back a StreamReaderWriter for each of these
        # modes.
        self.addCleanup(support.unlink, support.TESTFN)
        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
            with self.subTest(mode):
                with codecs.open(support.TESTFN, mode, 'ascii') as stream:
                    self.assertIsInstance(stream, codecs.StreamReaderWriter)

    def test_undefined(self):
        # The "undefined" codec raises UnicodeError for any input, including
        # empty input, and for each error handler tried here.
        for text, raw in (('abc', b'abc'), ('', b'')):
            self.assertRaises(UnicodeError, codecs.encode, text, 'undefined')
            self.assertRaises(UnicodeError, codecs.decode, raw, 'undefined')
        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
            self.assertRaises(UnicodeError,
                              codecs.encode, 'abc', 'undefined', errors)
            self.assertRaises(UnicodeError,
                              codecs.decode, b'abc', 'undefined', errors)
1782
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001792
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the underlying file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written through the wrapper reach the
        # underlying file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001804
# Names of the text encodings the tests below iterate over, kept in
# alphabetical order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_t", "koi8_u", "kz1048",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build exposes codecs.mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]
1912
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = ["punycode", "unicode_internal"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001922
Walter Dörwald3abcb012007-04-16 22:10:50 +00001923class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001924 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001925 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001926 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001927 name = codecs.lookup(encoding).name
1928 if encoding.endswith("_codec"):
1929 name += "_codec"
1930 elif encoding == "latin_1":
1931 name = "latin_1"
1932 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001933
Ezio Melottiadc417c2011-11-17 12:23:34 +02001934 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001935 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001936 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001937 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001938 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001939 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001940
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001941 if encoding not in broken_unicode_with_stateful:
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001942 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001943 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001944 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001945 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001946 for c in s:
1947 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001948 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001949 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001950 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001951 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001952 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001953 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001954 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001955 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001956 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001957 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001958
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10001959 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001960 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001961 try:
1962 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001963 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001964 pass
1965 else:
1966 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001967 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001968 for c in s:
1969 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001970 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001971 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001972 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001973 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001974 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001975 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001976 self.assertEqual(decodedresult, s,
1977 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001978
1979 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001980 result = "".join(codecs.iterdecode(
1981 codecs.iterencode(s, encoding), encoding))
1982 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001983
1984 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001985 result = "".join(codecs.iterdecode(
1986 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001987 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001988
Victor Stinner554f3f02010-06-16 23:33:54 +00001989 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001990 # check incremental decoder/encoder with errors argument
1991 try:
1992 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001993 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001994 pass
1995 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001996 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001997 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001998 decodedresult = "".join(decoder.decode(bytes([c]))
1999 for c in encodedresult)
2000 self.assertEqual(decodedresult, s,
2001 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002002
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002003 @support.cpython_only
2004 def test_basics_capi(self):
2005 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
2006 s = "abc123" # all codecs should be able to encode these
2007 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002008 if encoding not in broken_unicode_with_stateful:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002009 # check incremental decoder/encoder (fetched via the C API)
2010 try:
2011 cencoder = codec_incrementalencoder(encoding)
2012 except LookupError: # no IncrementalEncoder
2013 pass
2014 else:
2015 # check C API
2016 encodedresult = b""
2017 for c in s:
2018 encodedresult += cencoder.encode(c)
2019 encodedresult += cencoder.encode("", True)
2020 cdecoder = codec_incrementaldecoder(encoding)
2021 decodedresult = ""
2022 for c in encodedresult:
2023 decodedresult += cdecoder.decode(bytes([c]))
2024 decodedresult += cdecoder.decode(b"", True)
2025 self.assertEqual(decodedresult, s,
2026 "encoding=%r" % encoding)
2027
2028 if encoding not in ("idna", "mbcs"):
2029 # check incremental decoder/encoder with errors argument
2030 try:
2031 cencoder = codec_incrementalencoder(encoding, "ignore")
2032 except LookupError: # no IncrementalEncoder
2033 pass
2034 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00002035 encodedresult = b"".join(cencoder.encode(c) for c in s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02002036 cdecoder = codec_incrementaldecoder(encoding, "ignore")
2037 decodedresult = "".join(cdecoder.decode(bytes([c]))
2038 for c in encodedresult)
2039 self.assertEqual(decodedresult, s,
2040 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002041
Walter Dörwald729c31f2005-03-14 19:06:30 +00002042 def test_seek(self):
2043 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002044 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00002045 for encoding in all_unicode_encodings:
2046 if encoding == "idna": # FIXME: See SF bug #1163178
2047 continue
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002048 if encoding in broken_unicode_with_stateful:
Walter Dörwald729c31f2005-03-14 19:06:30 +00002049 continue
Victor Stinner05010702011-05-27 16:50:40 +02002050 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00002051 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00002052 # Test that calling seek resets the internal codec state and buffers
2053 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00002054 data = reader.read()
2055 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00002056
Walter Dörwalde22d3392005-11-17 08:52:34 +00002057 def test_bad_decode_args(self):
2058 for encoding in all_unicode_encodings:
2059 decoder = codecs.getdecoder(encoding)
2060 self.assertRaises(TypeError, decoder)
2061 if encoding not in ("idna", "punycode"):
2062 self.assertRaises(TypeError, decoder, 42)
2063
2064 def test_bad_encode_args(self):
2065 for encoding in all_unicode_encodings:
2066 encoder = codecs.getencoder(encoding)
Ezio Melottiadc417c2011-11-17 12:23:34 +02002067 with support.check_warnings():
2068 # unicode-internal has been deprecated
2069 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00002070
Thomas Wouters0e3f5912006-08-11 14:57:12 +00002071 def test_encoding_map_type_initialized(self):
2072 from encodings import cp1140
2073 # This used to crash, we are only verifying there's no crash.
2074 table_type = type(cp1140.encoding_table)
2075 self.assertEqual(table_type, table_type)
2076
Walter Dörwald3abcb012007-04-16 22:10:50 +00002077 def test_decoder_state(self):
2078 # Check that getstate() and setstate() handle the state properly
Guido van Rossumef87d6e2007-05-02 19:09:54 +00002079 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00002080 for encoding in all_unicode_encodings:
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +10002081 if encoding not in broken_unicode_with_stateful:
Walter Dörwald3abcb012007-04-16 22:10:50 +00002082 self.check_state_handling_decode(encoding, u, u.encode(encoding))
2083 self.check_state_handling_encode(encoding, u, u.encode(encoding))
2084
class CharmapTest(unittest.TestCase):
    # Tests for codecs.charmap_decode() with three kinds of mapping: a str
    # indexed by byte value, a dict of int -> str, and a dict of
    # int -> code point.  Every case decodes the bytes 0x00 0x01 0x02.

    # Expected output per lenient error handler when byte 0x02 has no
    # usable mapping.
    _LENIENT_RESULTS = (
        ("replace", "ab\ufffd"),
        ("backslashreplace", "ab\\x02"),
        ("ignore", "ab"),
    )

    def _check_decode(self, errors, mapping, expected):
        # The sample must decode to *expected*, consuming all three bytes.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", errors, mapping),
            (expected, 3)
        )

    def _check_undecodable(self, mapping):
        # In strict mode the sample must be rejected with *mapping*.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", mapping
        )

    def test_decode_with_string_map(self):
        self._check_decode("strict", "abc", "abc")
        self._check_decode("strict", "\U0010FFFFbc", "\U0010FFFFbc")

        # Byte 0x02 is either past the end of the map or mapped to U+FFFE.
        incomplete_maps = ("ab", "ab\ufffe")
        for charmap in incomplete_maps:
            self._check_undecodable(charmap)
        for errors, expected in self._LENIENT_RESULTS:
            for charmap in incomplete_maps:
                self._check_decode(errors, charmap, expected)

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        complete_cases = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for charmap, expected in complete_cases:
            self._check_decode("strict", charmap, expected)

        # Byte 0x02 is missing, mapped to None, or mapped to U+FFFE
        # (the last form is Issue #14850).
        incomplete_maps = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for charmap in incomplete_maps:
            self._check_undecodable(charmap)
        for errors, expected in self._LENIENT_RESULTS:
            for charmap in incomplete_maps:
                self._check_decode(errors, charmap, expected)

        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')

        complete_cases = (
            ({0: a, 1: b, 2: c}, "abc"),
            # Issue #15379
            ({0: 0x10FFFF, 1: b, 2: c}, "\U0010FFFFbc"),
            ({0: sys.maxunicode, 1: b, 2: c}, chr(sys.maxunicode) + "bc"),
        )
        for charmap, expected in complete_cases:
            self._check_decode("strict", charmap, expected)

        # A code point beyond sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
            {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        # Byte 0x02 is missing or mapped to 0xFFFE.
        incomplete_maps = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for charmap in incomplete_maps:
            self._check_undecodable(charmap)
        for errors, expected in self._LENIENT_RESULTS:
            for charmap in incomplete_maps:
                self._check_decode(errors, charmap, expected)
2319
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002320
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers and close
    # the wrapped stream when the with block is left.

    def test_encodedfile(self):
        # b"\xc3\xbc" is UTF-8 for U+00FC; EncodedFile recodes it to
        # Latin-1 on read.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        self.assertTrue(f.closed)

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
        # Same guarantee as for EncodedFile: leaving the block closes the
        # underlying stream.
        self.assertTrue(f.closed)
Thomas Wouters89f507f2006-12-13 04:49:30 +00002334
class TypesTest(unittest.TestCase):
    # Checks which argument types the low-level decode functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            getattr(codecs, name + "_decode")
            for name in (
                "utf_7",
                "utf_8",
                "utf_16_le",
                "utf_16_be",
                "utf_16_ex",
                "utf_32",
                "utf_32_le",
                "utf_32_be",
                "utf_32_ex",
                "latin_1",
                "ascii",
                "charmap",
            )
        ]
        # mbcs_decode is not always present, so only test it when it is.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)

        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
Victor Stinnere3b47152011-12-09 20:49:49 +01002374
Serhiy Storchakad6793772013-01-29 10:20:44 +02002375
2376class UnicodeEscapeTest(unittest.TestCase):
2377 def test_empty(self):
2378 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2379 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2380
2381 def test_raw_encode(self):
2382 encode = codecs.unicode_escape_encode
2383 for b in range(32, 127):
2384 if b != b'\\'[0]:
2385 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2386
2387 def test_raw_decode(self):
2388 decode = codecs.unicode_escape_decode
2389 for b in range(256):
2390 if b != b'\\'[0]:
2391 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2392
2393 def test_escape_encode(self):
2394 encode = codecs.unicode_escape_encode
2395 check = coding_checker(self, encode)
2396 check('\t', br'\t')
2397 check('\n', br'\n')
2398 check('\r', br'\r')
2399 check('\\', br'\\')
2400 for b in range(32):
2401 if chr(b) not in '\t\n\r':
2402 check(chr(b), ('\\x%02x' % b).encode())
2403 for b in range(127, 256):
2404 check(chr(b), ('\\x%02x' % b).encode())
2405 check('\u20ac', br'\u20ac')
2406 check('\U0001d120', br'\U0001d120')
2407
2408 def test_escape_decode(self):
2409 decode = codecs.unicode_escape_decode
2410 check = coding_checker(self, decode)
2411 check(b"[\\\n]", "[]")
2412 check(br'[\"]', '["]')
2413 check(br"[\']", "[']")
2414 check(br"[\\]", r"[\]")
2415 check(br"[\a]", "[\x07]")
2416 check(br"[\b]", "[\x08]")
2417 check(br"[\t]", "[\x09]")
2418 check(br"[\n]", "[\x0a]")
2419 check(br"[\v]", "[\x0b]")
2420 check(br"[\f]", "[\x0c]")
2421 check(br"[\r]", "[\x0d]")
2422 check(br"[\7]", "[\x07]")
2423 check(br"[\8]", r"[\8]")
2424 check(br"[\78]", "[\x078]")
2425 check(br"[\41]", "[!]")
2426 check(br"[\418]", "[!8]")
2427 check(br"[\101]", "[A]")
2428 check(br"[\1010]", "[A0]")
2429 check(br"[\x41]", "[A]")
2430 check(br"[\x410]", "[A0]")
2431 check(br"\u20ac", "\u20ac")
2432 check(br"\U0001d120", "\U0001d120")
2433 for b in range(256):
2434 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2435 check(b'\\' + bytes([b]), '\\' + chr(b))
2436
2437 def test_decode_errors(self):
2438 decode = codecs.unicode_escape_decode
2439 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2440 for i in range(d):
2441 self.assertRaises(UnicodeDecodeError, decode,
2442 b"\\" + c + b"0"*i)
2443 self.assertRaises(UnicodeDecodeError, decode,
2444 b"[\\" + c + b"0"*i + b"]")
2445 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2446 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2447 self.assertEqual(decode(data, "replace"),
2448 ("[\ufffd]\ufffd", len(data)))
2449 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2450 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2451 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2452
2453
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002454class RawUnicodeEscapeTest(unittest.TestCase):
2455 def test_empty(self):
2456 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2457 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2458
2459 def test_raw_encode(self):
2460 encode = codecs.raw_unicode_escape_encode
2461 for b in range(256):
2462 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2463
2464 def test_raw_decode(self):
2465 decode = codecs.raw_unicode_escape_decode
2466 for b in range(256):
2467 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2468
2469 def test_escape_encode(self):
2470 encode = codecs.raw_unicode_escape_encode
2471 check = coding_checker(self, encode)
2472 for b in range(256):
2473 if b not in b'uU':
2474 check('\\' + chr(b), b'\\' + bytes([b]))
2475 check('\u20ac', br'\u20ac')
2476 check('\U0001d120', br'\U0001d120')
2477
2478 def test_escape_decode(self):
2479 decode = codecs.raw_unicode_escape_decode
2480 check = coding_checker(self, decode)
2481 for b in range(256):
2482 if b not in b'uU':
2483 check(b'\\' + bytes([b]), '\\' + chr(b))
2484 check(br"\u20ac", "\u20ac")
2485 check(br"\U0001d120", "\U0001d120")
2486
2487 def test_decode_errors(self):
2488 decode = codecs.raw_unicode_escape_decode
2489 for c, d in (b'u', 4), (b'U', 4):
2490 for i in range(d):
2491 self.assertRaises(UnicodeDecodeError, decode,
2492 b"\\" + c + b"0"*i)
2493 self.assertRaises(UnicodeDecodeError, decode,
2494 b"[\\" + c + b"0"*i + b"]")
2495 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2496 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2497 self.assertEqual(decode(data, "replace"),
2498 ("[\ufffd]\ufffd", len(data)))
2499 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2500 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2501 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2502
2503
class SurrogateEscapeTest(unittest.TestCase):
    # Tests for the "surrogateescape" error handler with several codecs.

    def _check_roundtrip(self, encoding, raw, text):
        # *raw* must decode to *text*, and *text* must encode back to *raw*.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2536
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002537
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write through codecs.open() in several UTF-16/UTF-32 variants,
        # seeking in between, and check that reading back always yields
        # exactly the written text.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # subTest labels a failure with the encoding in use and lets
            # the remaining encodings run.
            with self.subTest(encoding=encoding):
                # Check if the BOM is written only once
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data[0])
                    self.assertNotEqual(f.tell(), 0)
                    f.seek(0)
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # (StreamWriter) Check that the BOM is written after a seek(0)
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data[0])
                    self.assertNotEqual(f.writer.tell(), 0)
                    f.writer.seek(0)
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data)

                # Check that the BOM is not written after a seek() at a
                # position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.write(data)
                    f.seek(f.tell())
                    f.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)

                # (StreamWriter) Check that the BOM is not written after a
                # seek() at a position different than the start
                with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                    f.writer.write(data)
                    f.writer.seek(f.writer.tell())
                    f.writer.write(data)
                    f.seek(0)
                    self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002593
Victor Stinner3fed0872010-05-22 02:16:27 +00002594
# Bytes-to-bytes codecs that are always available; the compression codecs
# are added below only when their module can be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# Alias names keyed by codec name (rot_13 is listed here although it is not
# in the bytes list above).
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound so it can still be referenced when zlib is absent.
    zlib = None
if zlib is not None:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update({"zlib_codec": ["zip", "zlib"]})

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update({"bz2_codec": ["bz2"]})
Georg Brandl02524622010-12-02 18:06:51 +00002624
2625class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002626
Georg Brandl02524622010-12-02 18:06:51 +00002627 def test_basics(self):
2628 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002629 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002630 with self.subTest(encoding=encoding):
2631 # generic codecs interface
2632 (o, size) = codecs.getencoder(encoding)(binput)
2633 self.assertEqual(size, len(binput))
2634 (i, size) = codecs.getdecoder(encoding)(o)
2635 self.assertEqual(size, len(o))
2636 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002637
Georg Brandl02524622010-12-02 18:06:51 +00002638 def test_read(self):
2639 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002640 with self.subTest(encoding=encoding):
2641 sin = codecs.encode(b"\x80", encoding)
2642 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2643 sout = reader.read()
2644 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002645
2646 def test_readline(self):
2647 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002648 with self.subTest(encoding=encoding):
2649 sin = codecs.encode(b"\x80", encoding)
2650 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2651 sout = reader.readline()
2652 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002653
def test_buffer_api_usage(self):
    # We check all the transform codecs accept memoryview input
    # for encoding and decoding
    # and also that they roundtrip correctly
    original = b"12345\x80"
    for name in bytes_transform_encodings:
        with self.subTest(encoding=name):
            # Encoding: bytes and memoryview input must agree
            encoded = codecs.encode(original, name)
            encoded_from_view = codecs.encode(memoryview(original), name)
            self.assertEqual(encoded_from_view, encoded)
            # Decoding: must round trip, again for both input types
            decoded = codecs.decode(encoded, name)
            self.assertEqual(decoded, original)
            decoded_from_view = codecs.decode(memoryview(encoded), name)
            self.assertEqual(decoded_from_view, decoded)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002671
def test_text_to_binary_blacklists_binary_transforms(self):
    # Check binary -> binary codecs give a good error for str input
    bad_input = "bad input type"
    # Both pieces are raw strings: "\(" is a regex escape, not a valid
    # Python string escape, so a non-raw literal triggers a warning
    fmt = (r"{!r} is not a text encoding; "
           r"use codecs.encode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg) as failure:
                bad_input.encode(encoding)
            # The lookup failure must be raised directly, not chained
            self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002683
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002684 def test_text_to_binary_blacklists_text_transforms(self):
2685 # Check str.encode gives a good error message for str -> str codecs
2686 msg = (r"^'rot_13' is not a text encoding; "
2687 "use codecs.encode\(\) to handle arbitrary codecs")
2688 with self.assertRaisesRegex(LookupError, msg):
2689 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002690
def test_binary_to_text_blacklists_binary_transforms(self):
    # Check bytes.decode and bytearray.decode give a good error
    # message for binary -> binary codecs
    data = b"encode first to ensure we meet any format restrictions"
    # Both pieces are raw strings: "\(" is a regex escape, not a valid
    # Python string escape, so a non-raw literal triggers a warning
    fmt = (r"{!r} is not a text encoding; "
           r"use codecs.decode\(\) to handle arbitrary codecs")
    for encoding in bytes_transform_encodings:
        with self.subTest(encoding=encoding):
            encoded_data = codecs.encode(data, encoding)
            msg = fmt.format(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                encoded_data.decode(encoding)
            with self.assertRaisesRegex(LookupError, msg):
                bytearray(encoded_data).decode(encoding)
2705
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002706 def test_binary_to_text_blacklists_text_transforms(self):
2707 # Check str -> str codec gives a good error for binary input
2708 for bad_input in (b"immutable", bytearray(b"mutable")):
2709 with self.subTest(bad_input=bad_input):
2710 msg = (r"^'rot_13' is not a text encoding; "
2711 "use codecs.decode\(\) to handle arbitrary codecs")
2712 with self.assertRaisesRegex(LookupError, msg) as failure:
2713 bad_input.decode("rot_13")
2714 self.assertIsNone(failure.exception.__cause__)
2715
@unittest.skipUnless(zlib, "Requires zlib support")
def test_custom_zlib_error_is_wrapped(self):
    # Check zlib codec gives a good error for malformed input
    expected = "^decoding with 'zlib_codec' codec failed"
    with self.assertRaisesRegex(Exception, expected) as caught:
        codecs.decode(b"hello", "zlib_codec")
    wrapper = caught.exception
    # The original error must be chained and be of the same type
    self.assertIsInstance(wrapper.__cause__, type(wrapper))
2724
2725 def test_custom_hex_error_is_wrapped(self):
2726 # Check hex codec gives a good error for malformed input
2727 msg = "^decoding with 'hex_codec' codec failed"
2728 with self.assertRaisesRegex(Exception, msg) as failure:
2729 codecs.decode(b"hello", "hex_codec")
2730 self.assertIsInstance(failure.exception.__cause__,
2731 type(failure.exception))
2732
2733 # Unfortunately, the bz2 module throws OSError, which the codec
2734 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002735
# Ensure codec aliases from http://bugs.python.org/issue7475 work
def test_aliases(self):
    # Every alias must resolve to the same codec as its canonical name
    for codec_name, aliases in transform_aliases.items():
        canonical = codecs.lookup(codec_name).name
        for alias in aliases:
            with self.subTest(alias=alias):
                resolved = codecs.lookup(alias).name
                self.assertEqual(resolved, canonical)
2744
Martin Panter06171bd2015-09-12 00:34:28 +00002745 def test_quopri_stateless(self):
2746 # Should encode with quotetabs=True
2747 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2748 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2749 # But should still support unescaped tabs and spaces
2750 unescaped = b"space tab eol\n"
2751 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2752
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002753 def test_uu_invalid(self):
2754 # Missing "begin" line
2755 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2756
Nick Coghlan8b097b42013-11-13 23:49:21 +10002757
2758# The codec system tries to wrap exceptions in order to ensure the error
2759# mentions the operation being performed and the codec involved. We
2760# currently *only* want this to happen for relatively stateless
2761# exceptions, where the only significant information they contain is their
2762# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002763
# A module-local registry backs a single search function, so registering
# a codec per test does not look like an object leak from repeatedly
# adding search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Search function: return the registered test codec, or None."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator
codecs.register(_get_test_codec)
2771
try:
    # Issue #22166: CPython also keeps an internal codec cache that the
    # tests need to clear
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        """Do nothing: fallback used when _codecs lacks _forget_codec."""
2778
2779
class ExceptionChainingTest(unittest.TestCase):
    """Check when exceptions raised inside a codec are wrapped.

    Each test installs a throwaway codec under a name unique to the test
    instance, then checks what str.encode(), bytes.decode(),
    codecs.encode() and codecs.decode() raise when that codec misbehaves.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        # The codec may never have been looked up, hence never cached
        with contextlib.suppress(KeyError):
            _forget_codec(self.codec_name)

    def set_codec(self, encode, decode):
        """Register encode/decode as the codec named self.codec_name."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Expect the body to raise a wrapped exc_type exception.

        The raised exception must match the "<operation> with <codec>
        codec failed (<type>: <msg>)" pattern and must be chained from an
        exc_type instance that carries a traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check all four entry points wrap obj_to_raise as exc_type."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check all four entry points let obj_to_raise through as-is.

        msg is a regex matched against the RuntimeError that escapes.
        """
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        # No codec is registered here, so every call fails at lookup time
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # Both pieces of each pattern are raw strings: "\(" is a regex
        # escape, not a valid Python string escape
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2939
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002940
Georg Brandl02524622010-12-02 18:06:51 +00002941
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError, an unknown
        # one with OSError
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the code page being used
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decoding cases against code page cp.

        An expected value of None means the decoding must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(result[0], expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, result[0], expected))
            # assert 0 <= result[1] <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encoding cases against code page cp.

        An expected value of None means the encoding must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(result[0], expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, result[0], expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = [
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ]
        self.check_encode(932, encode_cases)
        decode_cases = [
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ]
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ]
        self.check_encode(1252, encode_cases)
        decode_cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ]
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ]
        self.check_encode(cp, encode_cases)
        decode_cases = [
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ]
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = [
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ]
        self.check_decode(932, cp932_cases)
        utf8_cases = [
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ]
        self.check_decode(self.CP_UTF8, utf8_cases)
        # The encoding cases only run when VISTA_OR_LATER is set
        if not VISTA_OR_LATER:
            return
        self.check_encode(self.CP_UTF8, [
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ])

    def test_incremental(self):
        # With final=False a trailing incomplete sequence is left
        # unconsumed rather than reported as an error
        cases = [
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        ]
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
3100
3101
# Run this module's tests when it is executed as a script
if __name__ == "__main__":
    unittest.main()