blob: 8a6e0f91db0c3606e365d5c3f0d36e5690835492 [file] [log] [blame]
Victor Stinner05010702011-05-27 16:50:40 +02001import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10002import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10008import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +01009
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a reported major version >= 6;
# always False on every other platform (getwindowsversion() is not called).
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional: when it cannot be imported the module-level name is
# bound to None and the wchar_t size is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) if ctypes is not None else -1
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return an assertion helper bound to the test case *self*.

    The returned callable takes ``(input, expect)`` and asserts, through
    ``self.assertEqual``, that ``coder(input)`` equals the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
29
class Queue(object):
    """
    FIFO buffer: data is appended at one end and consumed from the other
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type (e.g. bytes or
        # str) that subsequent reads return.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front.

        A negative *size* (the default) drains the whole buffer.
        """
        pending = self._buffer
        if size >= 0:
            head, self._buffer = pending[:size], pending[size:]
            return head
        # Slice instead of using a literal so the empty buffer keeps the
        # same type as the data that was stored.
        self._buffer = pending[:0]
        return pending
49
class MixInCheckStateHandling:
    """Helpers checking getstate()/setstate() of incremental codecs.

    The methods call ``self.assert*``, so the class is meant to be mixed
    into a ``unittest.TestCase``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be saved and restored mid-stream.

        For every split point ``i`` of the encoded input *s*, decode
        ``s[:i]``, capture the decoder state, restore that state into a
        brand-new decoder and decode ``s[i:]`` with ``final=True``.  The
        two parts concatenated must equal the expected text *u*.
        """
        new_decoder = codecs.getincrementaldecoder(encoding)
        for i in range(len(s)+1):
            d = new_decoder()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = new_decoder()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be saved and restored mid-stream.

        For every split point ``i`` of the text *u*, encode ``u[:i]``,
        transfer the encoder state to a brand-new encoder and encode
        ``u[i:]`` with ``final=True``.  The two parts concatenated must
        equal the expected encoded form *s*.
        """
        new_encoder = codecs.getincrementalencoder(encoding)
        for i in range(len(u)+1):
            d = new_encoder()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = new_encoder()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
82
class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the per-encoding test cases.

    The methods read ``self.encoding``; test_lone_surrogates also reads
    ``self.ill_formed_sequence``.  Both are expected to be provided by the
    concrete subclass, which also mixes in ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* one byte at a time and check the running output.

        *input* is encoded with ``self.encoding``.  After the n-th byte has
        been fed, the text decoded so far must equal ``partialresults[n]``.
        The check is run against a StreamReader, an incremental decoder, the
        same incremental decoder after reset(), and codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same byte-by-byte check, driven through an incremental decoder.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        feed_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    # Temporary skip, see http://bugs.python.org/issue20542
    # unittest.skip() is a decorator factory and must be called with a
    # reason; applying it bare does not reliably mark the test as skipped.
    @unittest.skip("temporarily disabled, see http://bugs.python.org/issue20542")
    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)

        def getreader():
            # A fresh reader positioned at the start of the encoded data.
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating a StreamReader must give back each "\r\n"-terminated
        # line of this sample exactly as it was written.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so the reader only ever sees
        # what has been written up to that point.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of the ill-formed sequence when decoding with
    # the "replace" error handler (see test_lone_surrogates).
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate fails in strict mode and is handled by
        # each of the standard error handlers.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string (e.g. a BOM) is
        # stripped from the encoded fragments and prepended only once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
381
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order signalled by a BOM)."""
    encoding = "utf-32"
    # U+DC80 as one 32-bit code unit, laid out according to sys.byteorder
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = writer(raw)
        out.write("spam")
        out.write("spam")
        d = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(reader(io.BytesIO(d)).read(), "spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of 0xff may be accepted by the reader.
        for count in (4, 8):
            s = io.BytesIO(count*b"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
            ]
            # each character shows up once its fourth byte has been read
            + [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
476
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"
    # U+DC80 as one little-endian 32-bit code unit
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # A character only appears once its fourth byte has been read.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True the
        # strict handler must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
521
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"
    # U+DC80 as one big-endian 32-bit code unit
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # A character only appears once its fourth byte has been read.
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte is a truncated code unit; with final=True the
        # strict handler must raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
566
567
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (byte order signalled by a BOM)."""
    encoding = "utf-16"
    # U+DC80 as one 16-bit code unit, laid out according to sys.byteorder
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = writer(raw)
        out.write("spam")
        out.write("spam")
        d = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        self.assertEqual(reader(io.BytesIO(d)).read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two 0xffff units may be accepted by the reader.
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(data)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
            ]
            # BMP characters show up after two bytes, the final non-BMP
            # character only after all four bytes of its surrogate pair
            + [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000653
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"
    # U+DC80 as one little-endian 16-bit code unit
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # BMP characters show up after two bytes, the final non-BMP
        # character only after all four bytes of its surrogate pair.
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u0100\uffff\U00010000", expected)

    def test_errors(self):
        # (malformed input, text expected with the "replace" handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)

    def test_nonbmp(self):
        # U+10203 round-trips through its surrogate pair.
        surrogate_pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), surrogate_pair)
        self.assertEqual(surrogate_pair.decode(self.encoding), "\U00010203")
697
class UTF16BETest(ReadTest, unittest.TestCase):
    # Codec under test: UTF-16, big-endian byte order.
    encoding = "utf-16-be"
    # Big-endian bytes of the lone surrogate U+DC80.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # The input encodes to 12 bytes (four 2-byte characters plus one
        # 4-byte surrogate pair); the list holds 12 expected results.
        # NOTE(review): check_partial lives in ReadTest, outside this class;
        # presumably it feeds the encoded input one byte at a time and
        # compares the accumulated output -- confirm against ReadTest.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each pair is (malformed input, expected output with the 'replace'
        # error handler).  The same input must raise in strict, final mode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports which vector failed and lets the remaining
            # vectors run, matching UTF7Test.test_errors in this file.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the surrogate pair D800 DE03,
        # stored high byte first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
741
class UTF8Test(ReadTest, unittest.TestCase):
    encoding = "utf-8"
    # UTF-8 style encoding of the lone surrogate U+DC80, and the text the
    # 'replace' handler is expected to produce for it (one U+FFFD per byte).
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # The characters below encode to 1, 2, 2, 3, 3 and 4 bytes, hence
        # 15 expected results, grouped here by how long each one repeats.
        expected = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial("\x00\xff\u07ff\u0800\uffff\U00010000", expected)

    def test_decoder_state(self):
        # Code points on both sides of every UTF-8 length boundary.
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # With 'surrogatepass' a lone surrogate encodes to its three-byte
        # form and decodes back again.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000795
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes); an expected value
        # of None means the encode call must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Rows are (bytes, error handler, expected text); an expected value
        # of None means the decode call must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        else:
            cases.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # In strict mode a lone surrogate is rejected in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Each error handler substitutes its own replacement when encoding.
        by_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # With 'surrogatepass' a lone surrogate encodes to its three-byte
        # form and decodes back again.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
894
895
896
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_partial(self):
        # "a+-b" has four characters; the list holds five expected results.
        expected = ["a"] * 2 + ["a+", "a+-", "a+-b"]
        self.check_partial("a+-b", expected)

    def test_errors(self):
        # Rows are (malformed input, expected output with 'replace').
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                # Strict, final decoding must reject the input outright.
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # U+104A0 and its explicit surrogate-pair spelling both encode to
        # the same modified-base64 run, which decodes to the single
        # astral character.
        packed = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(self.encoding), packed)
        self.assertEqual('\ud801\udca0'.encode(self.encoding), packed)
        self.assertEqual(packed.decode(self.encoding), '\U000104A0')

    # Rebinding the name to None hides any test_lone_surrogates that would
    # otherwise be inherited (presumably from ReadTest -- confirm there).
    test_lone_surrogates = None
943
944
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single stray byte cannot be decoded in strict, final mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder with no arguments at all is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
952
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # An array.array is accepted as input and its raw bytes come back
        # together with the number of bytes consumed.
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        # An empty str yields empty bytes and a consumed length of zero.
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and an int argument are TypeErrors.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
968
class UTF8SigTest(UTF8Test, unittest.TestCase):
    encoding = "utf-8-sig"

    def test_partial(self):
        # The encoded input is 21 bytes long: a 3-byte signature written by
        # the codec, then the explicit U+FEFF (3 bytes) and the remaining
        # characters (1, 2, 2, 3, 3 and 4 bytes).
        expected = (
            # Nothing is produced while the first BOM is read and skipped,
            # nor while the second BOM is still incomplete.
            [""] * 5
            # Second BOM has been read and emitted
            + ["\ufeff"]
            # "\x00" read and emitted, then first byte of "\xff" read
            + ["\ufeff\x00"] * 2
            # "\xff" completed, then first byte of "\u07ff" read
            + ["\ufeff\x00\xff"] * 2
            + ["\ufeff\x00\xff\u07ff"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            expected,
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # The incremental decoder must drop the signature that the encoder
        # put in front of the text.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader, once with an
        # unbounded read() and once per chunk size, and check that the
        # concatenated chunks always equal *unistring*.
        make_reader = codecs.getreader("utf-8-sig")
        size_hints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in size_hints:
            istream = make_reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature.
        self._check_stream_read(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # The same text without any signature.
        self._check_stream_read(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )
1052
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        decode = codecs.escape_decode
        backslash = b'\\'
        for code in range(256):
            byte = bytes([code])
            if byte == backslash:
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # Rows are (escaped input, expected decoded bytes).
        escapes = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in escapes:
            check(escaped, expected)
        # A backslash followed by any byte that does not start a known
        # escape is left untouched.
        known = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in known:
                continue
            unknown = b'\\' + bytes([code])
            check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes, with zero or one hex digit: strict mode
        # raises, 'ignore' drops them, 'replace' substitutes b"?".
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001104
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001113
# Sample strings from RFC 3492.  Each entry is a pair of
# (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian)
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified)
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional)
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech (ASCII letters mixed with c-caron, e-caron and i-acute)
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari)
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana)
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables)
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic)
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish (ASCII letters mixed with e-acute and n-tilde)
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese (ASCII letters mixed with accented vowels)
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) Japanese: digit 3 and letter B mixed with kanji
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) Japanese: kanji followed by "-with-SUPER-MONKEYS"
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Japanese: "Hello-Another-Way-" followed by kana and kanji
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) Japanese: kana and kanji followed by the digit 2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Japanese: "Maji", "Koi" and "5" interleaved with kana and kanji
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) Japanese: katakana around the ASCII letters "de"
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) Japanese: hiragana and katakana only
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) Pure ASCII: -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check on the table itself: report any entry that is not a pair
# (for example after a missing comma merged two fields).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001221
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively: some reference encodings contain
            # upper-case letters while our encoder emits lower case only,
            # and lowering just the reference is not enough because some
            # of the inputs themselves contain upper-case characters.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from bytes rebuilt through an ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001240
class UnicodeInternalTest(unittest.TestCase):
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        # Inputs are written big-endian and byte-reversed on little-endian
        # machines before use.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings():
                decoded = internal.decode("unicode_internal")
                self.assertEqual(uni, decoded)
        for internal in not_ok:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the valid code point range.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The exception must name the codec, carry the full input and point
        # at the second 4-byte unit (offsets 4 to 8).
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                payload.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(payload, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the stock "ignore" handler under a private name and
        # decode "ab" with four 0x22 bytes spliced between the two
        # characters; the result must be "ab" with all 12 bytes consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            spliced = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decoder(bytes(spliced, "ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item of the result is the number of characters
            # consumed, not the number of bytes produced.
            encoder = codecs.getencoder("unicode_internal")
            consumed_ascii = encoder("a")[1]
            self.assertEqual(consumed_ascii, 1)
            consumed_latin = encoder("\xe9\u0142")[1]
            self.assertEqual(consumed_latin, 2)

            consumed_escape = codecs.escape_encode(br'\x00')[1]
            self.assertEqual(consumed_escape, 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001316
# Nameprep test vectors, taken from
# http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings and
# entry N (1-based) corresponds to test "3.N" of that document:
#   * expected is None      -> the input contains prohibited characters
#   * (None, None)          -> the vector is skipped
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8b'
     b'bar\xe2\x80\x8b\xe2\x81\xa0'
     b'baz\xef\xb8\x80\xef\xb8\x88\xef\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1'
     b'j\xcc\x8c\xc2\xa0\xc2\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc'
     b'\xe3\x83\x88\xe3\x83\xab'
     b'i\xcc\x87tel\x28d\x29'
     b'\xe3\x82\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
    ]
1469
1470
class NameprepTest(unittest.TestCase):
    """Run the nameprep test vectors against encodings.idna.nameprep()."""

    def test_nameprep(self):
        """Check every (input, expected) pair in ``nameprep_tests``.

        Pairs whose input is None are skipped.  Pairs whose expected value
        is None must make nameprep() raise UnicodeError.  All other pairs
        must produce exactly the expected string.  Every failure is
        labelled "Test 3.N" so it can be traced back to its vector.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters; attach the vector
                # label so a missing exception can be attributed.
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
                continue
            prepped = str(prepped, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                # Re-raise with the vector label, keeping the original
                # exception as the explicit cause.
                raise support.TestFailed("%s: %s" % (label, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001489
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes plain and "xn--" labels."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") is the inverse of test_builtin_decode."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader returns ""."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Feed the decoder one byte at a time, then in uneven chunks."""
        # Each domain is checked with and without a trailing dot, for both
        # an ASCII-only name and a name holding an "xn--" label.
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded),
                                          "idna")),
                expected,
                "input=%r" % encoded
            )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The last label is only emitted once final=True is passed.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        # A trailing dot terminates the label, so it is emitted right away.
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Feed the encoder one character at a time, then in uneven chunks."""
        # Same four shapes as in test_incremental_decode.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected,
                "input=%r" % text
            )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The last label is only emitted once final=True is passed.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001565
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level codecs functions."""

    def _check_getter(self, getter):
        # Shared by the codecs.get*() tests: calling without arguments is a
        # TypeError, asking for an unknown codec name is a LookupError.
        with self.assertRaises(TypeError):
            getter()
        with self.assertRaises(LookupError):
            getter("__spam__")

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        self._check_getter(codecs.getencoder)

    def test_getdecoder(self):
        self._check_getter(codecs.getdecoder)

    def test_getreader(self):
        self._check_getter(codecs.getreader)

    def test_getwriter(self):
        self._check_getter(codecs.getwriter)

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching, so the locale is restored even when
        # the test is skipped below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1620
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 stream reader wrapping an in-memory buffer."""

    def setUp(self):
        # Two multi-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001630
class EncodedFileTest(unittest.TestCase):
    """Recoding through codecs.EncodedFile, once reading and once writing."""

    def test_basic(self):
        # Reading: the underlying buffer holds UTF-8, read() yields UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes are written, Latin-1 lands in the buffer.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001642
# Codec names exercised by the generic tests, several per line and in the
# same (alphabetical) order as before.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental coder tests: everything above
# plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001761
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every name in ``all_unicode_encodings``.

    Every assertion made inside a per-encoding loop carries an
    "encoding=..." message so that a failure names the codec involved.
    """

    def test_basics(self):
        """Round-trip "abc123" through the stateless, stream and
        incremental interfaces of each codec."""
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            msg = "encoding=%r" % encoding
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                # The looked-up name is overridden for latin_1 (presumably
                # lookup() reports a different canonical name for it).
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"), msg)

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), msg)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, msg)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes, msg)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, msg)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, msg)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    @support.cpython_only
    def test_basics_capi(self):
        """Same incremental round-trips as test_basics, with the coders
        obtained through the _testcapi helpers."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                msg = "encoding=%r" % encoding
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, msg)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s, msg)

    def test_seek(self):
        """seek(0, 0) on a stream reader must allow a full re-read."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) an int argument."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1923
class CharmapTest(unittest.TestCase):
    # Exercises codecs.charmap_decode() with the three kinds of decoding
    # table used below: a str, a dict of int -> str and a dict of
    # int -> int.  Most cases decode the bytes 0, 1 and 2.

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # A table covering all three bytes decodes to the table itself.
        for table in ("abc", "\U0010FFFFbc"):
            self.assertEqual(decode(data, "strict", table), (table, 3))

        # Byte 2 is undefined when the table is too short or maps it to
        # U+FFFE.
        incomplete = ("ab", "ab\ufffe")
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))

        every_byte = bytes(range(256))
        self.assertEqual(decode(every_byte, "ignore", ""),
                         ("", len(every_byte)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Values may be single characters, longer strings, non-BMP
        # characters or the empty string.
        complete = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, text in complete:
            self.assertEqual(decode(data, "strict", table), (text, 3))

        # Byte 2 is undefined when it is missing from the table, mapped
        # to None, or mapped to U+FFFE (issue #14850).
        incomplete = (
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))

        every_byte = bytes(range(256))
        self.assertEqual(decode(every_byte, "ignore", {}),
                         ("", len(every_byte)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = map(ord, "abc")

        # Code points up to sys.maxunicode are accepted as map values
        # (issue #15379 covers the 0x10FFFF case).
        complete = (
            (a, "abc"),
            (0x10FFFF, "\U0010FFFFbc"),
            (sys.maxunicode, chr(sys.maxunicode) + "bc"),
        )
        for first, text in complete:
            self.assertEqual(decode(data, "strict", {0: first, 1: b, 2: c}),
                             (text, 3))

        # A value above sys.maxunicode is a TypeError, not a decode error.
        self.assertRaises(TypeError, decode, data, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # Byte 2 is undefined when missing or mapped to 0xFFFE.
        incomplete = (
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        )
        for table in incomplete:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", table)
        for errors, text in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete:
                self.assertEqual(decode(data, errors, table), (text, 3))
2117
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002118
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must work as context managers.

    def test_encodedfile(self):
        # The file holds UTF-8; reading through the wrapper yields Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            recoded = recoder.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as stream:
            decoded = stream.read()
            self.assertEqual(decoded, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002131
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept str input
        codec_names = [
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        ]
        if hasattr(codecs, "mbcs_decode"):
            codec_names.append("mbcs")
        for codec_name in codec_names:
            decoder = getattr(codecs, codec_name + "_decode")
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape naming a code point past U+10FFFF is an error in
        # strict mode and becomes U+FFFD with "replace".
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2167
Serhiy Storchakad6793772013-01-29 10:20:44 +02002168
2169class UnicodeEscapeTest(unittest.TestCase):
2170 def test_empty(self):
2171 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2172 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2173
2174 def test_raw_encode(self):
2175 encode = codecs.unicode_escape_encode
2176 for b in range(32, 127):
2177 if b != b'\\'[0]:
2178 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2179
2180 def test_raw_decode(self):
2181 decode = codecs.unicode_escape_decode
2182 for b in range(256):
2183 if b != b'\\'[0]:
2184 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2185
2186 def test_escape_encode(self):
2187 encode = codecs.unicode_escape_encode
2188 check = coding_checker(self, encode)
2189 check('\t', br'\t')
2190 check('\n', br'\n')
2191 check('\r', br'\r')
2192 check('\\', br'\\')
2193 for b in range(32):
2194 if chr(b) not in '\t\n\r':
2195 check(chr(b), ('\\x%02x' % b).encode())
2196 for b in range(127, 256):
2197 check(chr(b), ('\\x%02x' % b).encode())
2198 check('\u20ac', br'\u20ac')
2199 check('\U0001d120', br'\U0001d120')
2200
2201 def test_escape_decode(self):
2202 decode = codecs.unicode_escape_decode
2203 check = coding_checker(self, decode)
2204 check(b"[\\\n]", "[]")
2205 check(br'[\"]', '["]')
2206 check(br"[\']", "[']")
2207 check(br"[\\]", r"[\]")
2208 check(br"[\a]", "[\x07]")
2209 check(br"[\b]", "[\x08]")
2210 check(br"[\t]", "[\x09]")
2211 check(br"[\n]", "[\x0a]")
2212 check(br"[\v]", "[\x0b]")
2213 check(br"[\f]", "[\x0c]")
2214 check(br"[\r]", "[\x0d]")
2215 check(br"[\7]", "[\x07]")
2216 check(br"[\8]", r"[\8]")
2217 check(br"[\78]", "[\x078]")
2218 check(br"[\41]", "[!]")
2219 check(br"[\418]", "[!8]")
2220 check(br"[\101]", "[A]")
2221 check(br"[\1010]", "[A0]")
2222 check(br"[\x41]", "[A]")
2223 check(br"[\x410]", "[A0]")
2224 check(br"\u20ac", "\u20ac")
2225 check(br"\U0001d120", "\U0001d120")
2226 for b in range(256):
2227 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2228 check(b'\\' + bytes([b]), '\\' + chr(b))
2229
2230 def test_decode_errors(self):
2231 decode = codecs.unicode_escape_decode
2232 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2233 for i in range(d):
2234 self.assertRaises(UnicodeDecodeError, decode,
2235 b"\\" + c + b"0"*i)
2236 self.assertRaises(UnicodeDecodeError, decode,
2237 b"[\\" + c + b"0"*i + b"]")
2238 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2239 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2240 self.assertEqual(decode(data, "replace"),
2241 ("[\ufffd]\ufffd", len(data)))
2242 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2243 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2244 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2245
2246
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002247class RawUnicodeEscapeTest(unittest.TestCase):
2248 def test_empty(self):
2249 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2250 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2251
2252 def test_raw_encode(self):
2253 encode = codecs.raw_unicode_escape_encode
2254 for b in range(256):
2255 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2256
2257 def test_raw_decode(self):
2258 decode = codecs.raw_unicode_escape_decode
2259 for b in range(256):
2260 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2261
2262 def test_escape_encode(self):
2263 encode = codecs.raw_unicode_escape_encode
2264 check = coding_checker(self, encode)
2265 for b in range(256):
2266 if b not in b'uU':
2267 check('\\' + chr(b), b'\\' + bytes([b]))
2268 check('\u20ac', br'\u20ac')
2269 check('\U0001d120', br'\U0001d120')
2270
2271 def test_escape_decode(self):
2272 decode = codecs.raw_unicode_escape_decode
2273 check = coding_checker(self, decode)
2274 for b in range(256):
2275 if b not in b'uU':
2276 check(b'\\' + bytes([b]), '\\' + chr(b))
2277 check(br"\u20ac", "\u20ac")
2278 check(br"\U0001d120", "\U0001d120")
2279
2280 def test_decode_errors(self):
2281 decode = codecs.raw_unicode_escape_decode
2282 for c, d in (b'u', 4), (b'U', 4):
2283 for i in range(d):
2284 self.assertRaises(UnicodeDecodeError, decode,
2285 b"\\" + c + b"0"*i)
2286 self.assertRaises(UnicodeDecodeError, decode,
2287 b"[\\" + c + b"0"*i + b"]")
2288 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2289 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2290 self.assertEqual(decode(data, "replace"),
2291 ("[\ufffd]\ufffd", len(data)))
2292 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2293 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2294 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2295
2296
class SurrogateEscapeTest(unittest.TestCase):
    # With the "surrogateescape" handler an undecodable byte becomes a
    # lone surrogate (b"\x80" <-> "\udc80" below) and is restored
    # unchanged when encoding.

    def _check_escape(self, encoding, raw, text):
        # *raw* must decode to *text*, and *text* must encode back to *raw*.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_escape("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_escape("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_escape("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_escape("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2329
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002330
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Write through codecs.open() in 'w+' mode and read the text back,
        # checking how seeking affects where a BOM is written.
        text = "1234567890"
        doubled = text * 2
        first_char = text[0]
        encodings_under_test = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen(encoding):
            # Open the scratch file afresh, truncating earlier content.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings_under_test:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(first_char)
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(first_char)
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002386
Victor Stinner3fed0872010-05-22 02:16:27 +00002387
# Names of the bytes-to-bytes transform codecs under test.  The two
# compression codecs are appended below only when their module imports.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# Alternative names under which each transform codec can be looked up.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

try:
    import zlib
except ImportError:
    # Keep the name bound so later code can test for zlib support.
    zlib = None
if zlib is not None:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002417
2418class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002419
Georg Brandl02524622010-12-02 18:06:51 +00002420 def test_basics(self):
2421 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002422 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002423 with self.subTest(encoding=encoding):
2424 # generic codecs interface
2425 (o, size) = codecs.getencoder(encoding)(binput)
2426 self.assertEqual(size, len(binput))
2427 (i, size) = codecs.getdecoder(encoding)(o)
2428 self.assertEqual(size, len(o))
2429 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002430
Georg Brandl02524622010-12-02 18:06:51 +00002431 def test_read(self):
2432 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002433 with self.subTest(encoding=encoding):
2434 sin = codecs.encode(b"\x80", encoding)
2435 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2436 sout = reader.read()
2437 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002438
2439 def test_readline(self):
2440 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002441 with self.subTest(encoding=encoding):
2442 sin = codecs.encode(b"\x80", encoding)
2443 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2444 sout = reader.readline()
2445 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002446
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002447 def test_buffer_api_usage(self):
2448 # We check all the transform codecs accept memoryview input
2449 # for encoding and decoding
2450 # and also that they roundtrip correctly
2451 original = b"12345\x80"
2452 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002453 with self.subTest(encoding=encoding):
2454 data = original
2455 view = memoryview(data)
2456 data = codecs.encode(data, encoding)
2457 view_encoded = codecs.encode(view, encoding)
2458 self.assertEqual(view_encoded, data)
2459 view = memoryview(data)
2460 data = codecs.decode(data, encoding)
2461 self.assertEqual(data, original)
2462 view_decoded = codecs.decode(view, encoding)
2463 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002464
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002465 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002466 # Check binary -> binary codecs give a good error for str input
2467 bad_input = "bad input type"
2468 for encoding in bytes_transform_encodings:
2469 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002470 fmt = ( "{!r} is not a text encoding; "
2471 "use codecs.encode\(\) to handle arbitrary codecs")
2472 msg = fmt.format(encoding)
2473 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002474 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002475 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002476
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002477 def test_text_to_binary_blacklists_text_transforms(self):
2478 # Check str.encode gives a good error message for str -> str codecs
2479 msg = (r"^'rot_13' is not a text encoding; "
2480 "use codecs.encode\(\) to handle arbitrary codecs")
2481 with self.assertRaisesRegex(LookupError, msg):
2482 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002483
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002484 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002485 # Check bytes.decode and bytearray.decode give a good error
2486 # message for binary -> binary codecs
2487 data = b"encode first to ensure we meet any format restrictions"
2488 for encoding in bytes_transform_encodings:
2489 with self.subTest(encoding=encoding):
2490 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002491 fmt = (r"{!r} is not a text encoding; "
2492 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002493 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002494 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002495 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002496 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002497 bytearray(encoded_data).decode(encoding)
2498
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002499 def test_binary_to_text_blacklists_text_transforms(self):
2500 # Check str -> str codec gives a good error for binary input
2501 for bad_input in (b"immutable", bytearray(b"mutable")):
2502 with self.subTest(bad_input=bad_input):
2503 msg = (r"^'rot_13' is not a text encoding; "
2504 "use codecs.decode\(\) to handle arbitrary codecs")
2505 with self.assertRaisesRegex(LookupError, msg) as failure:
2506 bad_input.decode("rot_13")
2507 self.assertIsNone(failure.exception.__cause__)
2508
Zachary Wareefa2e042013-12-30 14:54:11 -06002509 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002510 def test_custom_zlib_error_is_wrapped(self):
2511 # Check zlib codec gives a good error for malformed input
2512 msg = "^decoding with 'zlib_codec' codec failed"
2513 with self.assertRaisesRegex(Exception, msg) as failure:
2514 codecs.decode(b"hello", "zlib_codec")
2515 self.assertIsInstance(failure.exception.__cause__,
2516 type(failure.exception))
2517
2518 def test_custom_hex_error_is_wrapped(self):
2519 # Check hex codec gives a good error for malformed input
2520 msg = "^decoding with 'hex_codec' codec failed"
2521 with self.assertRaisesRegex(Exception, msg) as failure:
2522 codecs.decode(b"hello", "hex_codec")
2523 self.assertIsInstance(failure.exception.__cause__,
2524 type(failure.exception))
2525
2526 # Unfortunately, the bz2 module throws OSError, which the codec
2527 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002528
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002529 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2530 def test_aliases(self):
2531 for codec_name, aliases in transform_aliases.items():
2532 expected_name = codecs.lookup(codec_name).name
2533 for alias in aliases:
2534 with self.subTest(alias=alias):
2535 info = codecs.lookup(alias)
2536 self.assertEqual(info.name, expected_name)
2537
Nick Coghlan8b097b42013-11-13 23:49:21 +10002538
2539# The codec system tries to wrap exceptions in order to ensure the error
2540# mentions the operation being performed and the codec involved. We
2541# currently *only* want this to happen for relatively stateless
2542# exceptions, where the only significant information they contain is their
2543# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002544
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Maps a codec name to the CodecInfo that the tests registered for it.
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Codec search function backed by _TEST_CODECS; returns None for any
    # name the tests have not registered.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2552
Nick Coghlan8b097b42013-11-13 23:49:21 +10002553class ExceptionChainingTest(unittest.TestCase):
2554
def setUp(self):
    # The test codec search function is registered once and never
    # removed, so each test uses a codec name derived from its own repr
    # to stay out of the way afterwards.  Appending id(self) keeps the
    # name unique when the tests run repeatedly (e.g. when hunting for
    # refleaks), so the codec cache never serves a stale entry.  The
    # codecs module normalizes codec names, so the name is normalized
    # and lower-cased up front.
    raw_name = "{!r}{}".format(self, id(self))
    self.codec_name = encodings.normalize_encoding(raw_name).lower()

    # The object to raise is stored on the instance, not captured in a
    # closure.  Cached codec entries cannot be recreated, and refleak
    # runs reuse the same test instance, so the codec callables must ask
    # the instance what to raise on every call.
    self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002575
def tearDown(self):
    # Drop this test's codec from the local registry, if it added one.
    try:
        del _TEST_CODECS[self.codec_name]
    except KeyError:
        pass
Nick Coghlan8b097b42013-11-13 23:49:21 +10002578
def set_codec(self, encode, decode):
    # Register *encode* and *decode* as this test's codec in the local
    # registry.
    name = self.codec_name
    _TEST_CODECS[name] = codecs.CodecInfo(encode, decode, name=name)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002583
@contextlib.contextmanager
def assertWrapped(self, operation, exc_type, msg):
    # Context manager: the body must raise *exc_type* carrying the
    # "<operation> with <codec> codec failed (...)" wrapper message.
    # That exception must be chained from a cause of the same type
    # that still has its traceback.
    pattern = r"{} with {!r} codec failed \({}: {}\)".format(
        operation, self.codec_name, exc_type.__name__, msg)
    with self.assertRaisesRegex(exc_type, pattern) as caught:
        yield caught
    cause = caught.exception.__cause__
    self.assertIsInstance(cause, exc_type)
    self.assertIsNotNone(cause.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002592
def raise_obj(self, *args, **kwds):
    # Codec callable that ignores its arguments and raises whatever
    # the instance currently holds in obj_to_raise.
    pending = self.obj_to_raise
    raise pending
Nick Coghlan8b097b42013-11-13 23:49:21 +10002596
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
    # Make the test codec raise *obj_to_raise* for encoding and
    # decoding, then check that each of the four entry points reports
    # it wrapped in an *exc_type* that mentions the operation.
    self.obj_to_raise = obj_to_raise
    self.set_codec(self.raise_obj, self.raise_obj)
    name = self.codec_name
    attempts = (
        ("encoding", "str_input".encode, (name,)),
        ("encoding", codecs.encode, ("str_input", name)),
        ("decoding", b"bytes input".decode, (name,)),
        ("decoding", codecs.decode, (b"bytes input", name)),
    )
    for operation, call, args in attempts:
        with self.assertWrapped(operation, exc_type, msg):
            call(*args)
2608
2609 def test_raise_by_type(self):
2610 self.check_wrapped(RuntimeError, "")
2611
2612 def test_raise_by_value(self):
2613 msg = "This should be wrapped"
2614 self.check_wrapped(RuntimeError(msg), msg)
2615
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002616 def test_raise_grandchild_subclass_exact_size(self):
2617 msg = "This should be wrapped"
2618 class MyRuntimeError(RuntimeError):
2619 __slots__ = ()
2620 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2621
2622 def test_raise_subclass_with_weakref_support(self):
2623 msg = "This should be wrapped"
2624 class MyRuntimeError(RuntimeError):
2625 pass
2626 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2627
def check_not_wrapped(self, obj_to_raise, msg):
    # Make the test codec raise *obj_to_raise* for encoding and
    # decoding, then check that each of the four entry points lets a
    # RuntimeError matching the *msg* pattern propagate.
    def raise_obj(*args, **kwds):
        raise obj_to_raise
    self.set_codec(raise_obj, raise_obj)
    name = self.codec_name
    attempts = (
        ("str input".encode, (name,)),
        (codecs.encode, ("str input", name)),
        (b"bytes input".decode, (name,)),
        (codecs.decode, (b"bytes input", name)),
    )
    for call, args in attempts:
        with self.assertRaisesRegex(RuntimeError, msg):
            call(*args)
2640
def test_init_override_is_not_wrapped(self):
    # RuntimeError subclass whose __init__ accepts no arguments
    # beyond self.
    class CustomInit(RuntimeError):
        def __init__(self):
            pass

    self.check_not_wrapped(obj_to_raise=CustomInit, msg="")
2646
def test_new_override_is_not_wrapped(self):
    # RuntimeError subclass whose __new__ accepts no arguments
    # beyond the class itself.
    class CustomNew(RuntimeError):
        def __new__(cls):
            instance = super().__new__(cls)
            return instance

    self.check_not_wrapped(obj_to_raise=CustomNew, msg="")
2652
def test_instance_attribute_is_not_wrapped(self):
    text = "This should NOT be wrapped"
    exc = RuntimeError(text)
    # Attach an extra attribute to the instance before it is raised
    exc.attr = 1
    # Anchor the pattern so the whole message must equal text
    anchored = "^" + text + "$"
    self.check_not_wrapped(exc, anchored)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002658
def test_non_str_arg_is_not_wrapped(self):
    # The exception's single argument is an int rather than a str
    exc = RuntimeError(1)
    self.check_not_wrapped(exc, "1")
2661
def test_multiple_args_is_not_wrapped(self):
    # The exception is built from three arguments; the anchored
    # pattern is the literal text "('a', 'b', 'c')".
    exc = RuntimeError('a', 'b', 'c')
    msg_re = r"^\('a', 'b', 'c'\)$"
    self.check_not_wrapped(exc, msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002665
# http://bugs.python.org/issue19609
def test_codec_lookup_failure_not_wrapped(self):
    codec = self.codec_name
    msg = "^unknown encoding: {}$".format(codec)
    # The initial codec lookup should not be wrapped
    # (callable, positional arguments), in the order they are exercised
    lookups = (
        ("str input".encode, (codec,)),
        (codecs.encode, ("str input", codec)),
        (b"bytes input".decode, (codec,)),
        (codecs.decode, (b"bytes input", codec)),
    )
    for func, call_args in lookups:
        with self.assertRaisesRegex(LookupError, msg):
            func(*call_args)
2678
def test_unflagged_non_text_codec_handling(self):
    # The stdlib non-text codecs are now marked so they're
    # pre-emptively skipped by the text model related methods
    # However, third party codecs won't be flagged, so we still make
    # sure the case where an inappropriate output type is produced is
    # handled appropriately
    def encode_to_str(*args, **kwds):
        # Encoder that returns str where bytes would be expected
        return "not bytes!", 0
    def decode_to_bytes(*args, **kwds):
        # Decoder that returns bytes where str would be expected
        return b"not str!", 0
    self.set_codec(encode_to_str, decode_to_bytes)
    # No input or output type checks on the codecs module functions
    encoded = codecs.encode(None, self.codec_name)
    self.assertEqual(encoded, "not bytes!")
    decoded = codecs.decode(None, self.codec_name)
    self.assertEqual(decoded, b"not str!")
    # Text model methods should complain
    # Both halves of each pattern are raw strings: the second half
    # contains the regex escapes \( and \), which are invalid escape
    # sequences in a non-raw string literal.
    fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
           r"use codecs.encode\(\) to encode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        "str_input".encode(self.codec_name)
    fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
           r"use codecs.decode\(\) to decode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        b"bytes input".decode(self.codec_name)
2706
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002707
Georg Brandl02524622010-12-02 18:06:51 +00002708
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode().
    # Each case table below holds (input, error handler, expected)
    # triples; an expected value of None means the call must raise.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 must raise ValueError and 123 must raise OSError, for
        # both the encoder and the decoder.
        for exc_type, cp in ((ValueError, -1), (OSError, 123)):
            with self.assertRaises(exc_type):
                codecs.code_page_encode(cp, 'a')
            with self.assertRaises(exc_type):
                codecs.code_page_decode(cp, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page name
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Decode every (raw, errors, expected) case with code page cp
        for raw, errors, expected in tests:
            if expected is None:
                # No expected text: decoding has to fail
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Encode every (text, errors, expected) case with code page cp
        for text, errors, expected in tests:
            if expected is None:
                # No expected bytes: encoding has to fail
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = encoded[0]
            consumed = encoded[1]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            # The whole input string must be reported as consumed
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The encoder cases only run when the module-level
        # VISTA_OR_LATER flag is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each call passes False as the fourth positional argument
        # (presumably the decoder's "final" flag -- confirm against the
        # codecs.code_page_decode signature).  The expected result is
        # the (text, consumed byte count) pair.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2856
2857
# Run this module's tests when the file is executed as a script
if __name__ == "__main__":
    unittest.main()