# Extracted from blob 7d0eeb66fc212664257f64844c7bd3a836567633 (repository browser blame view).
import codecs
import contextlib
import encodings
import io
import locale
import sys
import unittest
import warnings

from test import support

# True only on Windows whose reported major version is at least 6.
# sys.getwindowsversion() exists only on Windows, hence the platform guard.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T to the sentinel -1; otherwise SIZEOF_WCHAR_T is the size
# in bytes that ctypes reports for c_wchar.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *self* and *coder*.

    The returned function calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result equals ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type that the queue
        # holds: everything written later is concatenated onto it.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* drains the whole queue; otherwise at most *size*
        items are returned and the remainder stays queued.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class MixInCheckStateHandling:
    """Mixin with checks that incremental codec state can be saved with
    getstate() and restored into a fresh codec object with setstate().

    The host class must provide the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point i of the encoded input s: decode s[:i],
        # capture the decoder state, restore it into a brand-new decoder,
        # decode the rest, and require that both parts add up to u.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Mirror image of the decode check: encode u[:i], move the encoder
        # state into a new encoder, encode the rest with final=True, and
        # require that both parts add up to s.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Reading/decoding tests shared by the per-encoding test classes.

    Subclasses must define ``encoding``; test_lone_surrogates() additionally
    reads ``ill_formed_sequence`` from the subclass.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)  # encode once, reuse below
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read line by line until readline() returns an empty string and
            # join the lines with "|" so line boundaries show in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly these lines, in
        # order, with their "\r\n" line ends intact.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data arrives incrementally
        # between readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string is treated as a prefix
        # and stripped from the separately encoded pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)

class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    # Byte sequence spliced into encoded data by test_lone_surrogates;
    # its byte order follows the native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected cumulative result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    # Byte sequence spliced into encoded data by test_lone_surrogates.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected cumulative result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    # Byte sequence spliced into encoded data by test_lone_surrogates.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected cumulative result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    encoding = "utf-16"
    # Byte sequence spliced into encoded data by test_lone_surrogates;
    # its byte order follows the native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected cumulative result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the codecs.open() call is expected to emit the
        # DeprecationWarning, so only it sits inside check_warnings().
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-le"
    # Byte sequence spliced into encoded data by test_lone_surrogates.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected cumulative result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry: (invalid input, expected text with errors="replace").
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            # Use the class's own encoding name rather than a hard-coded alias.
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec (big-endian UTF-16, no BOM)."""

    encoding = "utf-16-be"
    # The code unit 0xDC80 (a lone low surrogate) in big-endian byte order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # check_partial() is inherited from ReadTest, which is defined outside
        # this class; presumably each list entry is the text expected after
        # one more encoded byte has been fed to the decoder -- TODO confirm.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, text expected with 'replace').
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # One sub-test per vector (as UTF7Test.test_errors does) so that
            # a failing vector is reported without hiding the remaining ones.
            with self.subTest(raw=raw, expected=expected):
                # Strict decoding of the final chunk must reject the input...
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                # ...while 'replace' substitutes U+FFFD for the bad part.
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character is stored as a surrogate pair, and the pair
        # decodes back to the single character.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
739
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # check_partial() comes from ReadTest (defined outside this class).
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        # Sample text covering the boundaries between 1-, 2-, 3- and 4-byte
        # UTF-8 sequences.
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # The surrogateescape check below is only made for UTF-8; it is
        # unclear whether it would make sense for UTF-16 and UTF-32.
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000793
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec."""

    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes); an expected
        # value of None means that encoding must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))

        for text, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Each case is (bytes, error handler, expected text); an expected
        # value of None means that decoding must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))

        for raw, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # With the default (strict) handler lone surrogates are rejected in
        # both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        # (error handler, expected encoding of "[\uDC80]")
        by_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
892
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200893
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # (expected partial text, number of consecutive entries holding it);
        # expanded below into the flat list that check_partial() (inherited
        # from ReadTest, defined outside this class) receives.
        runs = [
            ('a', 2),
            ('a+', 1),
            ('a+-', 1),
            ('a+-b', 5),
            ('a+-b\x00', 1),
            ('a+-b\x00c', 5),
            ('a+-b\x00c\x80', 1),
            ('a+-b\x00c\x80d', 5),
            ('a+-b\x00c\x80d\u0100', 1),
            ('a+-b\x00c\x80d\u0100e', 8),
            ('a+-b\x00c\x80d\u0100e\U00010000', 1),
            ('a+-b\x00c\x80d\u0100e\U00010000f', 1),
        ]
        expected = [partial for partial, count in runs for _ in range(count)]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, text expected with the 'replace' handler)
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                # Strict decoding of the final chunk must reject the input.
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair encode to the
        # same bytes, which decode to the single astral character.
        encoded = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Disable the test_lone_surrogates test inherited from ReadTest.
    test_lone_surrogates = None
967
968
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() function."""

    def test_errors(self):
        # A single byte cannot be a complete code unit when final=True.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
976
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array.array of signed bytes is accepted as input.
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an argument of an unsupported type.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
992
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # check_partial() is inherited through UTF8Test and is defined outside
        # this class.  The text starts with an explicit U+FEFF, so its encoded
        # form carries two BOMs: the signature written by the codec and the
        # encoded character itself.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder is dropped by the incremental
        # decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a range of
        # read sizes (None means one unbounded read() per call) and check that
        # the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One sub-test per size so a failure names the offending size and
            # the remaining sizes are still exercised.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature: the BOM must not show
        # up in the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without the signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1076
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Every byte except the backslash is copied through unchanged.
        for value in range(256):
            byte = bytes([value])
            if byte == b'\\':
                continue
            raw = byte + b'0'
            self.assertEqual(codecs.escape_decode(raw), (raw, 2))

    def test_escape(self):
        def check(raw, expected):
            # The whole input must be consumed and decode to *expected*.
            self.assertEqual(codecs.escape_decode(raw), (expected, len(raw)))

        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for raw, expected in cases:
            check(raw, expected)

        # A backslash followed by any byte that does not start a recognised
        # escape is left as is.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for value in range(256):
            if value in recognised:
                continue
            sequence = b'\\' + bytes([value])
            check(sequence, sequence)

    def test_errors(self):
        # Truncated \x escapes: zero or one hex digit instead of two.
        for prefix, consumed in ((br"\x", 6), (br"\x0", 8)):
            with self.assertRaises(ValueError):
                codecs.escape_decode(prefix)
            bracketed = b"[" + prefix + b"]"
            with self.assertRaises(ValueError):
                codecs.escape_decode(bracketed)
            doubled = bracketed + prefix
            self.assertEqual(codecs.escape_decode(doubled, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(codecs.escape_decode(doubled, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001128
class RecodingTest(unittest.TestCase):
    """Smoke test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        f = io.BytesIO()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so merely writing and closing is the test.
        # The with block closes the recoder even when write() raises.
        with codecs.EncodedFile(f, "unicode_internal", "utf-8") as f2:
            f2.write("a")
Fred Drake2e2be372001-09-20 21:33:42 +00001137
# Sample strings from RFC 3492.  Each entry is a pair:
# (Unicode text, expected Punycode encoding as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #     T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #     <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Sanity check on the table itself: print any entry that is not a pair
# (for instance because a comma is missing between two literals).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001245
class PunycodeTest(unittest.TestCase):
    """Checks the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii")
            wanted = puny.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt through an ASCII
            # str round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001264
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        # The byte strings below are written big-endian and reversed on
        # little-endian machines.
        accepted = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in accepted:
            if little_endian:
                internal = internal[::-1]
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in rejected:
            if little_endian:
                internal = internal[::-1]
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                with self.assertRaises(UnicodeDecodeError):
                    internal.decode("unicode_internal")
        # 0x110000 is the first value above the Unicode range.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        raw = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                raw.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(raw, ex.object)
        # The error covers the second four-byte unit only.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the stock "ignore" handler under a private name and check
        # that the decoder calls it for the invalid unit spliced in between
        # the two valid characters.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            corrupted = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "ascii")
            ignored = decoder(corrupted, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item returned by an encoder is the number of input
            # items consumed, not the size of the output.
            encoder = codecs.getencoder("unicode_internal")
            consumed_ascii = encoder("a")[1]
            self.assertEqual(consumed_ascii, 1)
            consumed_non_ascii = encoder("\xe9\u0142")[1]
            self.assertEqual(consumed_non_ascii, 2)

            consumed_escape = codecs.escape_encode(br'\x00')[1]
            self.assertEqual(consumed_escape, 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001340
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected), both given as UTF-8 encoded bytes:
#   * expected is None   -> nameprep() must reject the input with UnicodeError
#   * (None, None)       -> the vector is skipped
# NameprepTest reports a failing vector as "Test 3.<index + 1>", so entries
# must stay in order and skipped vectors must keep their placeholder.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1493
1494
class NameprepTest(unittest.TestCase):
    # Runs encodings.idna.nameprep() over the nameprep_tests vector table.

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered from 3.1 in the source document.
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters.  Pass the vector
                # number along so a missing UnicodeError can be traced back
                # to the table entry that caused it.
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    raise support.TestFailed("%s: %s" % (label, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001513
class IDNACodecTest(unittest.TestCase):
    # Exercises the "idna" codec through the one-shot, stream and
    # incremental interfaces.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The same four names as in
        # test_builtin_decode are used: ASCII and non-ASCII, each with and
        # without a trailing dot.
        vectors = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in vectors:
            pieces = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(pieces, "idna")),
                expected,
                "input=%r" % encoded
            )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The last label is only emitted once the input is declared final.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        # A trailing dot completes the label, so it is emitted right away.
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode, encoded one character at
        # a time through iterencode().
        vectors = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in vectors:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected,
                "input=%r" % text
            )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The last label is only emitted once the input is declared final.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001589
class CodecsModuleTest(unittest.TestCase):
    # Checks for the module-level helper functions of the codecs module.

    def _check_missing_and_unknown(self, getter):
        # Calling without a codec name is a TypeError; an unknown codec
        # name is a LookupError.
        with self.assertRaises(TypeError):
            getter()
        with self.assertRaises(LookupError):
            getter("__spam__")

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        self._check_missing_and_unknown(codecs.lookup)
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        self._check_missing_and_unknown(codecs.getencoder)

    def test_getdecoder(self):
        self._check_missing_and_unknown(codecs.getdecoder)

    def test_getreader(self):
        self._check_missing_and_unknown(codecs.getreader)

    def test_getwriter(self):
        self._check_missing_and_unknown(codecs.getwriter)

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE setting whatever happens below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1656
class StreamReaderTest(unittest.TestCase):
    # readlines() on a UTF-8 stream reader wrapped around an in-memory
    # byte stream holding two multi-byte characters separated by a newline.

    def setUp(self):
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        expected_lines = ['\ud55c\n', '\uae00']
        self.assertEqual(wrapped.readlines(), expected_lines)
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001666
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the wrapper hands the
        # same text back encoded as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data written to the wrapper ends up in the
        # underlying file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001678
# Codec names that BasicUnicodeTest loops over.  Names use the underscore
# spelling; test_basics compares each one (with "_" replaced by "-") against
# the name reported by codecs.lookup().
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1784
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode; the stream
# reader/writer checks in BasicUnicodeTest skip them.
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings skipped by the incremental encoder/decoder checks: everything
# listed above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001797
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks that are run against every codec listed in
    # all_unicode_encodings.  Assertions made inside the per-codec loops
    # pass "encoding=..." as the failure message so that a failure names
    # the codec that triggered it.

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            msg = "encoding=%r" % encoding
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"), msg)

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), msg)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, msg)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, msg)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, msg)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, msg)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            msg = "encoding=%r" % encoding
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1959
class CharmapTest(unittest.TestCase):
    """Checks for codecs.charmap_decode() with str, int->str and int->int maps.

    Each case decodes a short byte string and compares the
    ``(text, bytes_consumed)`` pair returned by the decoder, or expects an
    exception when the mapping has no usable entry under "strict".
    """

    def test_decode_with_string_map(self):
        """Decode with a str table indexed by byte value."""
        decode = codecs.charmap_decode
        sample = b"\x00\x01\x02"

        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(sample, "strict", table), (table, 3))

        # Byte 2 has no usable entry: the table is too short or holds U+FFFE.
        incomplete_tables = ("ab", "ab\ufffe")
        for table in incomplete_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(sample, "strict", table)

        for handler, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete_tables:
                self.assertEqual(decode(sample, handler, table),
                                 (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(decode(every_byte, "ignore", ""),
                         ("", len(every_byte)))

    def test_decode_with_int2str_map(self):
        """Decode with a dict mapping byte values to strings."""
        decode = codecs.charmap_decode
        sample = b"\x00\x01\x02"

        complete_maps = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for mapping, expected in complete_maps:
            self.assertEqual(decode(sample, "strict", mapping),
                             (expected, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (the U+FFFE case is Issue #14850).
        incomplete_maps = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for mapping in incomplete_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(sample, "strict", mapping)

        for handler, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for mapping in incomplete_maps:
                self.assertEqual(decode(sample, handler, mapping),
                                 (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(decode(every_byte, "ignore", {}),
                         ("", len(every_byte)))

    def test_decode_with_int2int_map(self):
        """Decode with a dict mapping byte values to code points."""
        decode = codecs.charmap_decode
        sample = b"\x00\x01\x02"
        a, b, c = map(ord, "abc")

        self.assertEqual(decode(sample, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(sample, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(
            decode(sample, "strict", {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3))

        # One past the largest code point is rejected with TypeError.
        with self.assertRaises(TypeError):
            decode(sample, "strict", {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE.
        incomplete_maps = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for mapping in incomplete_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(sample, "strict", mapping)

        for handler, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for mapping in incomplete_maps:
                self.assertEqual(decode(sample, handler, mapping),
                                 (expected, 3))
2153
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002154
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers can be used as context managers."""

    def test_encodedfile(self):
        """EncodedFile recodes the UTF-8 file content to Latin-1 on read."""
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter decodes the UTF-8 bytes to text on read."""
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002167
class TypesTest(unittest.TestCase):
    """Which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            bytes_only.append(mbcs_decode)
        for decode in bytes_only:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # A code point above U+10FFFF is an error under the default handler
        # and becomes U+FFFD under "replace".
        for decode in escape_decoders:
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2203
Serhiy Storchakad6793772013-01-29 10:20:44 +02002204
2205class UnicodeEscapeTest(unittest.TestCase):
2206 def test_empty(self):
2207 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2208 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2209
2210 def test_raw_encode(self):
2211 encode = codecs.unicode_escape_encode
2212 for b in range(32, 127):
2213 if b != b'\\'[0]:
2214 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2215
2216 def test_raw_decode(self):
2217 decode = codecs.unicode_escape_decode
2218 for b in range(256):
2219 if b != b'\\'[0]:
2220 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2221
2222 def test_escape_encode(self):
2223 encode = codecs.unicode_escape_encode
2224 check = coding_checker(self, encode)
2225 check('\t', br'\t')
2226 check('\n', br'\n')
2227 check('\r', br'\r')
2228 check('\\', br'\\')
2229 for b in range(32):
2230 if chr(b) not in '\t\n\r':
2231 check(chr(b), ('\\x%02x' % b).encode())
2232 for b in range(127, 256):
2233 check(chr(b), ('\\x%02x' % b).encode())
2234 check('\u20ac', br'\u20ac')
2235 check('\U0001d120', br'\U0001d120')
2236
2237 def test_escape_decode(self):
2238 decode = codecs.unicode_escape_decode
2239 check = coding_checker(self, decode)
2240 check(b"[\\\n]", "[]")
2241 check(br'[\"]', '["]')
2242 check(br"[\']", "[']")
2243 check(br"[\\]", r"[\]")
2244 check(br"[\a]", "[\x07]")
2245 check(br"[\b]", "[\x08]")
2246 check(br"[\t]", "[\x09]")
2247 check(br"[\n]", "[\x0a]")
2248 check(br"[\v]", "[\x0b]")
2249 check(br"[\f]", "[\x0c]")
2250 check(br"[\r]", "[\x0d]")
2251 check(br"[\7]", "[\x07]")
2252 check(br"[\8]", r"[\8]")
2253 check(br"[\78]", "[\x078]")
2254 check(br"[\41]", "[!]")
2255 check(br"[\418]", "[!8]")
2256 check(br"[\101]", "[A]")
2257 check(br"[\1010]", "[A0]")
2258 check(br"[\x41]", "[A]")
2259 check(br"[\x410]", "[A0]")
2260 check(br"\u20ac", "\u20ac")
2261 check(br"\U0001d120", "\U0001d120")
2262 for b in range(256):
2263 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2264 check(b'\\' + bytes([b]), '\\' + chr(b))
2265
2266 def test_decode_errors(self):
2267 decode = codecs.unicode_escape_decode
2268 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2269 for i in range(d):
2270 self.assertRaises(UnicodeDecodeError, decode,
2271 b"\\" + c + b"0"*i)
2272 self.assertRaises(UnicodeDecodeError, decode,
2273 b"[\\" + c + b"0"*i + b"]")
2274 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2275 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2276 self.assertEqual(decode(data, "replace"),
2277 ("[\ufffd]\ufffd", len(data)))
2278 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2279 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2280 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2281
2282
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002283class RawUnicodeEscapeTest(unittest.TestCase):
2284 def test_empty(self):
2285 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2286 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2287
2288 def test_raw_encode(self):
2289 encode = codecs.raw_unicode_escape_encode
2290 for b in range(256):
2291 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2292
2293 def test_raw_decode(self):
2294 decode = codecs.raw_unicode_escape_decode
2295 for b in range(256):
2296 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2297
2298 def test_escape_encode(self):
2299 encode = codecs.raw_unicode_escape_encode
2300 check = coding_checker(self, encode)
2301 for b in range(256):
2302 if b not in b'uU':
2303 check('\\' + chr(b), b'\\' + bytes([b]))
2304 check('\u20ac', br'\u20ac')
2305 check('\U0001d120', br'\U0001d120')
2306
2307 def test_escape_decode(self):
2308 decode = codecs.raw_unicode_escape_decode
2309 check = coding_checker(self, decode)
2310 for b in range(256):
2311 if b not in b'uU':
2312 check(b'\\' + bytes([b]), '\\' + chr(b))
2313 check(br"\u20ac", "\u20ac")
2314 check(br"\U0001d120", "\U0001d120")
2315
2316 def test_decode_errors(self):
2317 decode = codecs.raw_unicode_escape_decode
2318 for c, d in (b'u', 4), (b'U', 4):
2319 for i in range(d):
2320 self.assertRaises(UnicodeDecodeError, decode,
2321 b"\\" + c + b"0"*i)
2322 self.assertRaises(UnicodeDecodeError, decode,
2323 b"[\\" + c + b"0"*i + b"]")
2324 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2325 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2326 self.assertEqual(decode(data, "replace"),
2327 ("[\ufffd]\ufffd", len(data)))
2328 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2329 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2330 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2331
2332
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        handler = "surrogateescape"
        # Bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)
        # bad-utf-8 encoded surrogate
        raw, text = b"\xed\xb0\x80", "\udced\udcb0\udc80"
        self.assertEqual(raw.decode("utf-8", handler), text)
        self.assertEqual(text.encode("utf-8", handler), raw)

    def test_ascii(self):
        handler = "surrogateescape"
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", handler), text)
        self.assertEqual(text.encode("ascii", handler), raw)

    def test_charmap(self):
        handler = "surrogateescape"
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", handler), text)
        self.assertEqual(text.encode("iso-8859-3", handler), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
2365
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002366
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() streams around seek() calls."""

    def test_seek0(self):
        """Write, seek and re-read a file for each UTF-16/UTF-32 variant."""
        data = "1234567890"
        encodings_under_test = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(name):
            # Each scenario starts from a freshly truncated test file.
            return codecs.open(support.TESTFN, 'w+', encoding=name)

        for name in encodings_under_test:
            # Check if the BOM is written only once
            with reopen(name) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with reopen(name) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(name) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(name) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(name) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002422
Victor Stinner3fed0872010-05-22 02:16:27 +00002423
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The optional
# compression codecs are appended below when their modules import.
bytes_transform_encodings = ["base64_codec", "uu_codec"]
bytes_transform_encodings += ["quopri_codec", "hex_codec"]

# Alternative names under which each transform codec can be looked up.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": [
        "quopri",
        "quoted_printable",
        "quotedprintable",
    ],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name defined so that it can still be tested for truth.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002453
2454class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002455
Georg Brandl02524622010-12-02 18:06:51 +00002456 def test_basics(self):
2457 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002458 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002459 with self.subTest(encoding=encoding):
2460 # generic codecs interface
2461 (o, size) = codecs.getencoder(encoding)(binput)
2462 self.assertEqual(size, len(binput))
2463 (i, size) = codecs.getdecoder(encoding)(o)
2464 self.assertEqual(size, len(o))
2465 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002466
Georg Brandl02524622010-12-02 18:06:51 +00002467 def test_read(self):
2468 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002469 with self.subTest(encoding=encoding):
2470 sin = codecs.encode(b"\x80", encoding)
2471 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2472 sout = reader.read()
2473 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002474
2475 def test_readline(self):
2476 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002477 with self.subTest(encoding=encoding):
2478 sin = codecs.encode(b"\x80", encoding)
2479 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2480 sout = reader.readline()
2481 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002482
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002483 def test_buffer_api_usage(self):
2484 # We check all the transform codecs accept memoryview input
2485 # for encoding and decoding
2486 # and also that they roundtrip correctly
2487 original = b"12345\x80"
2488 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002489 with self.subTest(encoding=encoding):
2490 data = original
2491 view = memoryview(data)
2492 data = codecs.encode(data, encoding)
2493 view_encoded = codecs.encode(view, encoding)
2494 self.assertEqual(view_encoded, data)
2495 view = memoryview(data)
2496 data = codecs.decode(data, encoding)
2497 self.assertEqual(data, original)
2498 view_decoded = codecs.decode(view, encoding)
2499 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002500
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002501 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002502 # Check binary -> binary codecs give a good error for str input
2503 bad_input = "bad input type"
2504 for encoding in bytes_transform_encodings:
2505 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002506 fmt = ( "{!r} is not a text encoding; "
2507 "use codecs.encode\(\) to handle arbitrary codecs")
2508 msg = fmt.format(encoding)
2509 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002510 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002511 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002512
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002513 def test_text_to_binary_blacklists_text_transforms(self):
2514 # Check str.encode gives a good error message for str -> str codecs
2515 msg = (r"^'rot_13' is not a text encoding; "
2516 "use codecs.encode\(\) to handle arbitrary codecs")
2517 with self.assertRaisesRegex(LookupError, msg):
2518 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002519
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002520 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002521 # Check bytes.decode and bytearray.decode give a good error
2522 # message for binary -> binary codecs
2523 data = b"encode first to ensure we meet any format restrictions"
2524 for encoding in bytes_transform_encodings:
2525 with self.subTest(encoding=encoding):
2526 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002527 fmt = (r"{!r} is not a text encoding; "
2528 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002529 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002530 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002531 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002532 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002533 bytearray(encoded_data).decode(encoding)
2534
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002535 def test_binary_to_text_blacklists_text_transforms(self):
2536 # Check str -> str codec gives a good error for binary input
2537 for bad_input in (b"immutable", bytearray(b"mutable")):
2538 with self.subTest(bad_input=bad_input):
2539 msg = (r"^'rot_13' is not a text encoding; "
2540 "use codecs.decode\(\) to handle arbitrary codecs")
2541 with self.assertRaisesRegex(LookupError, msg) as failure:
2542 bad_input.decode("rot_13")
2543 self.assertIsNone(failure.exception.__cause__)
2544
Zachary Wareefa2e042013-12-30 14:54:11 -06002545 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002546 def test_custom_zlib_error_is_wrapped(self):
2547 # Check zlib codec gives a good error for malformed input
2548 msg = "^decoding with 'zlib_codec' codec failed"
2549 with self.assertRaisesRegex(Exception, msg) as failure:
2550 codecs.decode(b"hello", "zlib_codec")
2551 self.assertIsInstance(failure.exception.__cause__,
2552 type(failure.exception))
2553
2554 def test_custom_hex_error_is_wrapped(self):
2555 # Check hex codec gives a good error for malformed input
2556 msg = "^decoding with 'hex_codec' codec failed"
2557 with self.assertRaisesRegex(Exception, msg) as failure:
2558 codecs.decode(b"hello", "hex_codec")
2559 self.assertIsInstance(failure.exception.__cause__,
2560 type(failure.exception))
2561
2562 # Unfortunately, the bz2 module throws OSError, which the codec
2563 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002564
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002565 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2566 def test_aliases(self):
2567 for codec_name, aliases in transform_aliases.items():
2568 expected_name = codecs.lookup(codec_name).name
2569 for alias in aliases:
2570 with self.subTest(alias=alias):
2571 info = codecs.lookup(alias)
2572 self.assertEqual(info.name, expected_name)
2573
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002574 def test_uu_invalid(self):
2575 # Missing "begin" line
2576 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2577
Nick Coghlan8b097b42013-11-13 23:49:21 +10002578
2579# The codec system tries to wrap exceptions in order to ensure the error
2580# mentions the operation being performed and the codec involved. We
2581# currently *only* want this to happen for relatively stateless
2582# exceptions, where the only significant information they contain is their
2583# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002584
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Codec search function: look *codec_name* up in _TEST_CODECS.

    Returns the stored entry, or None when the name is not present.
    """
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# codecs.register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)
2592
try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # Fallback when the import above fails: a no-op with the same call
    # signature, so callers can invoke _forget_codec() unconditionally.
    def _forget_codec(codec_name):
        pass
2599
2600
Nick Coghlan8b097b42013-11-13 23:49:21 +10002601class ExceptionChainingTest(unittest.TestCase):
2602
2603 def setUp(self):
2604 # There's no way to unregister a codec search function, so we just
2605 # ensure we render this one fairly harmless after the test
2606 # case finishes by using the test case repr as the codec name
2607 # The codecs module normalizes codec names, although this doesn't
2608 # appear to be formally documented...
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002609 # We also make sure we use a truly unique id for the custom codec
2610 # to avoid issues with the codec cache when running these tests
2611 # multiple times (e.g. when hunting for refleaks)
2612 unique_id = repr(self) + str(id(self))
2613 self.codec_name = encodings.normalize_encoding(unique_id).lower()
2614
2615 # We store the object to raise on the instance because of a bad
2616 # interaction between the codec caching (which means we can't
2617 # recreate the codec entry) and regrtest refleak hunting (which
2618 # runs the same test instance multiple times). This means we
2619 # need to ensure the codecs call back in to the instance to find
2620 # out which exception to raise rather than binding them in a
2621 # closure to an object that may change on the next run
2622 self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002623
Nick Coghlan4e553e22013-11-16 00:35:34 +10002624 def tearDown(self):
2625 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8fad1672014-09-15 23:50:44 +12002626 # Issue #22166: Also pop from caches to avoid appearance of ref leaks
2627 encodings._cache.pop(self.codec_name, None)
2628 try:
2629 _forget_codec(self.codec_name)
2630 except KeyError:
2631 pass
Nick Coghlan8b097b42013-11-13 23:49:21 +10002632
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002633 def set_codec(self, encode, decode):
2634 codec_info = codecs.CodecInfo(encode, decode,
Nick Coghlan4e553e22013-11-16 00:35:34 +10002635 name=self.codec_name)
2636 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002637
2638 @contextlib.contextmanager
2639 def assertWrapped(self, operation, exc_type, msg):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002640 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002641 operation, self.codec_name, exc_type.__name__, msg)
2642 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2643 yield caught
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002644 self.assertIsInstance(caught.exception.__cause__, exc_type)
Nick Coghlan77b286b2014-01-27 00:53:38 +10002645 self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002646
2647 def raise_obj(self, *args, **kwds):
2648 # Helper to dynamically change the object raised by a test codec
2649 raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002650
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
    # Install a codec whose encoder and decoder both raise obj_to_raise,
    # then verify (via assertWrapped) that each of the four entry points
    # reports the failure wrapped with the operation and codec name.
    self.obj_to_raise = obj_to_raise
    self.set_codec(self.raise_obj, self.raise_obj)
    attempts = (
        ("encoding", lambda: "str_input".encode(self.codec_name)),
        ("encoding", lambda: codecs.encode("str_input", self.codec_name)),
        ("decoding", lambda: b"bytes input".decode(self.codec_name)),
        ("decoding", lambda: codecs.decode(b"bytes input", self.codec_name)),
    )
    for operation, attempt in attempts:
        with self.assertWrapped(operation, exc_type, msg):
            attempt()
2662
2663 def test_raise_by_type(self):
2664 self.check_wrapped(RuntimeError, "")
2665
2666 def test_raise_by_value(self):
2667 msg = "This should be wrapped"
2668 self.check_wrapped(RuntimeError(msg), msg)
2669
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002670 def test_raise_grandchild_subclass_exact_size(self):
2671 msg = "This should be wrapped"
2672 class MyRuntimeError(RuntimeError):
2673 __slots__ = ()
2674 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2675
2676 def test_raise_subclass_with_weakref_support(self):
2677 msg = "This should be wrapped"
2678 class MyRuntimeError(RuntimeError):
2679 pass
2680 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2681
def check_not_wrapped(self, obj_to_raise, msg):
    # Install a codec whose encoder and decoder both raise obj_to_raise,
    # then verify that each of the four entry points lets a RuntimeError
    # whose message matches the regex msg propagate.
    def fail_with_obj(*args, **kwds):
        raise obj_to_raise

    self.set_codec(fail_with_obj, fail_with_obj)
    attempts = (
        lambda: "str input".encode(self.codec_name),
        lambda: codecs.encode("str input", self.codec_name),
        lambda: b"bytes input".decode(self.codec_name),
        lambda: codecs.decode(b"bytes input", self.codec_name),
    )
    for attempt in attempts:
        with self.assertRaisesRegex(RuntimeError, msg):
            attempt()
2694
2695 def test_init_override_is_not_wrapped(self):
2696 class CustomInit(RuntimeError):
2697 def __init__(self):
2698 pass
2699 self.check_not_wrapped(CustomInit, "")
2700
2701 def test_new_override_is_not_wrapped(self):
2702 class CustomNew(RuntimeError):
2703 def __new__(cls):
2704 return super().__new__(cls)
2705 self.check_not_wrapped(CustomNew, "")
2706
2707 def test_instance_attribute_is_not_wrapped(self):
2708 msg = "This should NOT be wrapped"
2709 exc = RuntimeError(msg)
2710 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002711 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002712
2713 def test_non_str_arg_is_not_wrapped(self):
2714 self.check_not_wrapped(RuntimeError(1), "1")
2715
2716 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002717 msg_re = r"^\('a', 'b', 'c'\)$"
2718 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002719
2720 # http://bugs.python.org/issue19609
2721 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002722 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002723 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002724 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002725 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002726 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002727 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002728 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002729 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002730 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002731 codecs.decode(b"bytes input", self.codec_name)
2732
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002733 def test_unflagged_non_text_codec_handling(self):
2734 # The stdlib non-text codecs are now marked so they're
2735 # pre-emptively skipped by the text model related methods
2736 # However, third party codecs won't be flagged, so we still make
2737 # sure the case where an inappropriate output type is produced is
2738 # handled appropriately
2739 def encode_to_str(*args, **kwds):
2740 return "not bytes!", 0
2741 def decode_to_bytes(*args, **kwds):
2742 return b"not str!", 0
2743 self.set_codec(encode_to_str, decode_to_bytes)
2744 # No input or output type checks on the codecs module functions
2745 encoded = codecs.encode(None, self.codec_name)
2746 self.assertEqual(encoded, "not bytes!")
2747 decoded = codecs.decode(None, self.codec_name)
2748 self.assertEqual(decoded, b"not str!")
2749 # Text model methods should complain
2750 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2751 "use codecs.encode\(\) to encode to arbitrary types$")
2752 msg = fmt.format(self.codec_name)
2753 with self.assertRaisesRegex(TypeError, msg):
2754 "str_input".encode(self.codec_name)
2755 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2756 "use codecs.decode\(\) to decode to arbitrary types$")
2757 msg = fmt.format(self.codec_name)
2758 with self.assertRaisesRegex(TypeError, msg):
2759 b"bytes input".decode(self.codec_name)
2760
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002761
Georg Brandl02524622010-12-02 18:06:51 +00002762
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().
    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError; code page 123
        # is rejected with OSError.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00', 'strict', True)
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        # Each test is (raw bytes, error handler, expected text); an
        # expected value of None means decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors, True)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors, True)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text, consumed = result[0], result[1]
            self.assertEqual(text, expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each test is (text, error handler, expected bytes); an expected
        # value of None means encoding must raise UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data, consumed = result[0], result[1]
            self.assertEqual(data, expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The CP_UTF8 encode cases only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # With final=False a trailing incomplete multibyte sequence is
        # left unconsumed instead of being reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2918
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()