blob: c04ffd78e4eba0c448117c57238aaeaf5d897911 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10003import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01004import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02005import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01006import sys
7import unittest
8import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10009import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +010010
11from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020012
# VISTA_OR_LATER is true only when running on Windows with a major version
# of 6 or higher; on every other platform it is False.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False
17
# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T is set to the sentinel -1.  Otherwise SIZEOF_WCHAR_T is
# the size in bytes of the platform's C ``wchar_t`` as reported by ctypes.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000025
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to a test and a coder.

    *self* is the object whose ``assertEqual`` is used for the comparison and
    *coder* is a callable taking the input and returning a 2-tuple.  Calling
    the returned helper asserts that ``coder(input)`` equals
    ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
30
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer's type: later writes are
        # appended with ``+=`` and the buffer is emptied by slicing.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* (the default) drains the whole buffer; otherwise at
        most *size* items are returned and removed.
        """
        if size<0:
            s = self._buffer
            self._buffer = self._buffer[:0] # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
50
class MixInCheckStateHandling:
    # Mixin providing getstate()/setstate() round-trip checks for incremental
    # codecs.  The host class must supply the unittest assertion methods
    # used below (assertIsInstance, assertFalse, assertEqual).

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point of the encoded input s: decode the first
        # part, transfer the decoder state to a fresh decoder, decode the
        # rest with it, and check that the two parts together equal u.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # For every split point of the text u: encode the first part,
        # transfer the encoder state to a fresh encoder, encode the rest
        # with it, and check that the two parts together equal s.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
83
class ReadTest(MixInCheckStateHandling):
    # Shared reader/decoder tests.  Concrete subclasses provide ``encoding``
    # and ``ill_formed_sequence`` as class attributes and mix in
    # unittest.TestCase for the assertion methods.

    def check_partial(self, input, partialresults):
        # The encoded form of input is fed one byte at a time; after each
        # byte the accumulated output must equal the matching entry of
        # partialresults.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and join
            # them with "|" so that line boundaries are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were encoded, in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one Queue, so data written after
        # a readline() call becomes visible to later readline() calls.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate is an error in strict mode; the other
        # error handlers substitute or drop it.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string (a BOM, if any) is
        # stripped from the encoded neighbours and prepended only once.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
380
class UTF32Test(ReadTest, unittest.TestCase):
    encoding = "utf-32"
    # Byte sequence used by the inherited lone-surrogate test; its byte
    # order follows the byte order of the running platform.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, in little- and big-endian form.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Reading a stream made only of 0xff bytes must raise UnicodeError.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single stray byte is replaced or dropped, and reported as
        # consumed, by the 'replace' and 'ignore' handlers respectively.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
475
class UTF32LETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-le"
    # Byte sequence used by the inherited lone-surrogate test.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # One expected partial result per encoded byte (no BOM here).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
520
class UTF32BETest(ReadTest, unittest.TestCase):
    encoding = "utf-32-be"
    # Byte sequence used by the inherited lone-surrogate test.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # One expected partial result per encoded byte (no BOM here).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
565
566
class UTF16Test(ReadTest, unittest.TestCase):
    encoding = "utf-16"
    # Byte sequence used by the inherited lone-surrogate test; its byte
    # order follows the byte order of the running platform.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, in little- and big-endian form.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Reading a stream made only of 0xff bytes must raise UnicodeError.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single stray byte is replaced or dropped, and reported as
        # consumed, by the 'replace' and 'ignore' handlers respectively.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Only the codecs.open() call is wrapped, so the DeprecationWarning
        # check covers opening with mode 'U' and nothing else.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000652
class UTF16LETest(ReadTest, unittest.TestCase):
    encoding = "utf-16-le"
    # Byte sequence used by the inherited lone-surrogate test.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # One expected partial result per encoded byte (no BOM here).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs an invalid byte string with the text expected
        # when it is decoded with the 'replace' error handler.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 must round-trip through the four bytes given below.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
696
class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the big-endian UTF-16 codec (no BOM).  The shared test
    # methods come from ReadTest, which is defined elsewhere in this file;
    # the two attributes below are presumably read by those shared tests
    # (confirm against ReadTest).
    encoding = "utf-16-be"
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # The input string encodes to 12 bytes in UTF-16-BE and the expected
        # list has 12 entries -- presumably one decoded prefix per byte fed
        # to the decoder (see ReadTest.check_partial).
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # Every input must also be rejected by a strict, final decode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest names the failing vector and lets the remaining
            # vectors run instead of stopping at the first failure.
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # A non-BMP code point must round-trip through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
740
class UTF8Test(ReadTest, unittest.TestCase):
    # UTF-8 codec tests.  Shared test methods are inherited from ReadTest
    # (defined elsewhere in this file); the attributes below are presumably
    # read by those shared tests (confirm against ReadTest).
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # The text encodes to 15 bytes in UTF-8; the list below has 15
        # entries, passed unchanged to the inherited check_partial helper.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_steps = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_steps)

    def test_decoder_state(self):
        # Sample covers the boundaries of each UTF-8 sequence length.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, data in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), data)
            self.assertEqual(data.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or interrupted surrogate sequences are still errors.
        self.assertRaises(UnicodeDecodeError,
                          b"abc\xed\xa0".decode, "utf-8", "surrogatepass")
        self.assertRaises(UnicodeDecodeError,
                          b"abc\xed\xa0z".decode, "utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000794
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the Windows code page 65001 codec.  Expectations for lone
    # surrogates differ depending on VISTA_OR_LATER (set at module level).
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means "must raise".
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases.extend([
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ])
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            failure_note = ('%a.encode("cp65001", %r)=%a != %a'
                            % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, failure_note)

    def test_decode(self):
        # (bytes, error handler, expected text); None means "must raise".
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases.extend([
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ])
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            failure_note = ('%a.decode("cp65001", %r)=%a != %a'
                            % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, failure_note)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        # (error handler, expected encoding of "[\uDC80]")
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler),
                             expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, data in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), data)
            self.assertEqual(data.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
893
894
895
class UTF7Test(ReadTest, unittest.TestCase):
    # UTF-7 codec tests; shared test methods are inherited from ReadTest
    # (defined elsewhere in this file).
    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        expected_steps = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected_steps)

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler);
        # every input must also fail a strict, final decode.
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair encode to
        # the same bytes; decoding yields the single astral character.
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')

    # Rebinding the name to None replaces the test_lone_surrogates method
    # that ReadTest provides, so no such test runs for UTF-7.
    test_lone_surrogates = None
942
943
class UTF16ExTest(unittest.TestCase):
    # Direct tests of the low-level codecs.utf_16_ex_decode function.

    def test_errors(self):
        # A single odd byte cannot be decoded when final=True.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
951
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode, which returns the raw bytes of
    # its argument together with the number of bytes consumed.

    def test_array(self):
        from array import array
        signed_bytes = array("b", b"spam")
        result = codecs.readbuffer_encode(signed_bytes)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an argument without buffer support.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
967
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM).
    # Inherits every UTF8Test test and overrides only the encoding name and
    # the partial-decoding expectations.
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must drop the BOM written by the encoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sig_stream(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a range
        # of read sizes (None means "read everything at once") and check
        # that the concatenated chunks equal *unistring* every time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest reports which read size failed and keeps going.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is None:
                        data = istream.read()
                    else:
                        data = istream.read(sizehint)
                    if not data:
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode like plain UTF-8.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)
1051
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode, which interprets backslash escapes in
    # a bytes object and returns (decoded bytes, input length consumed).

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by "0", passes through.
        decode = codecs.escape_decode
        for code in range(256):
            single = bytes([code])
            if single == b'\\':
                continue
            payload = single + b'0'
            self.assertEqual(decode(payload), (payload, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes), checked in order.
        vectors = (
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        )
        for escaped, expected in vectors:
            check(escaped, expected)
        # A backslash before any byte that does not start a recognised
        # escape is expected to come back unchanged.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            unknown = b'\\' + bytes([code])
            check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # (bare truncated \x escape, bracketed form, bracketed + bare form,
        #  input length reported when the combined form is decoded)
        truncated = (
            (br"\x", br"[\x]", br"[\x]\x", 6),
            (br"\x0", br"[\x0]", br"[\x0]\x0", 8),
        )
        for bare, bracketed, combined, consumed in truncated:
            self.assertRaises(ValueError, decode, bare)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(combined, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(combined, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001103
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Regression test with no assertions: Python used to crash on this
        # at exit because of a refcount bug in _codecsmodule.c
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001112
# Punycode sample strings from RFC 3492.
# Each entry is a 2-tuple: (unicode text, expected punycode as bytes).
# The letters (A)-(S) label the individual samples.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese (the transliteration continues on the second line):
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check on the table above: print any entry that is not
# a 2-tuple.  It only prints; nothing is raised, so a malformed entry
# surfaces later when a test unpacks it.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001220
class PunycodeTest(unittest.TestCase):
    # Runs the module-level punycode_testcases table (pairs of unicode text
    # and expected punycode bytes) through the "punycode" codec in both
    # directions.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest names the failing sample and lets the others run.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again from a bytes object rebuilt through an ASCII
                # round trip; a separate name is used so the loop variable
                # is not rebound.
                rebuilt = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001239
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec.  Most of them run only when
    # SIZEOF_WCHAR_T (computed at module level via ctypes) is 4.  The codec
    # calls are wrapped in support.check_warnings(...); with an explicit
    # (message, category) filter that helper presumably also requires the
    # warning to have been issued -- confirm against test.support.

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # Byte strings below are written most-significant byte first and
        # are reversed before use on little-endian hosts.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        # Inputs that must be rejected: values above 0x10ffff and lengths
        # that are not a multiple of four bytes.
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000, the first value past the Unicode range, spelled in the
        # host's byte order.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The error must identify the codec, carry the original input and
        # point at the second four-byte unit (offsets 4 to 8).
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            # Decoding succeeded, which is itself a failure.
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a custom handler name, then
        # decode "ab" with four 0x22 bytes spliced between the two
        # characters; the result must be "ab" with all 12 bytes consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Element [1] of an encoder's result is the input length consumed.
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001315
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is an (input, expected) pair of UTF-8 encoded byte strings that
# NameprepTest feeds to encodings.idna.nameprep():
#   * expected is bytes  -> nameprep(input) must equal the decoded value;
#   * expected is None   -> nameprep(input) must raise UnicodeError;
#   * (None, None)       -> the vector is skipped.  The placeholder keeps the
#                           list position in step with the "3.N" numbering
#                           that the test reports on failure.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1468
1469
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        # Run every entry of nameprep_tests through nameprep().  Each entry
        # is (input, expected) as UTF-8 bytes; expected None means the input
        # must be rejected, and input None marks a skipped vector.  Every
        # failure message carries the vector number ("Test 3.N").
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
                continue
            prepped = str(prepped, "utf-8", "surrogatepass")
            try:
                actual = nameprep(orig)
            except Exception as e:
                # nameprep() rejected an input it should accept: keep the
                # vector number in the report, as the test always did.
                raise support.TestFailed("%s: %s" % (label, str(e)))
            # Compare outside the try block so that a wrong result is
            # reported as a regular assertion failure with its diff intact
            # instead of being flattened into a TestFailed string.
            self.assertEqual(actual, prepped, label)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001488
class IDNACodecTest(unittest.TestCase):
    # Round-trip checks for the "idna" codec through the builtin str/bytes
    # API, the stream reader, and the incremental encoder/decoder.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once the three available bytes have been read, a further read()
        # must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The same four names as in
        # test_builtin_decode are used; previously the punycode name with a
        # trailing dot was checked twice and the one without never.
        vectors = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for encoded, expected in vectors:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that ends it has been seen;
        # the last label is held back until the final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Mirror of test_incremental_decode: all four names, including the
        # non-ASCII name without a trailing dot that used to be missing.
        vectors = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in vectors:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        # As for decoding, the trailing label is buffered until final=True.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001564
class CodecsModuleTest(unittest.TestCase):
    # Argument and lookup checks for the module-level functions of codecs.

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # Unknown codec name; same check as in test_encode.
        self.assertRaises(LookupError, codecs.decode, b"foo", "__spam__")
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # A single non-ASCII character, mirroring b'\xff' in test_decode.
        # (The former literal '\xffff' was the three characters U+00FF, "f",
        # "f", not one code point.)
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so the original LC_CTYPE
        # comes back even when the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1619
class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        # Two UTF-8 encoded characters with a newline between them.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split after the newline and keep it on the
        # first line.
        expected_lines = ['\ud55c\n', '\uae00']
        decoded_stream = self.reader(self.stream)
        self.assertEqual(decoded_stream.readlines(), expected_lines)
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001629
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the underlying file holds UTF-8 and read() returns the
        # same text as UTF-16-LE bytes.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoding_reader = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        recoded = recoding_reader.read()
        self.assertEqual(recoded, b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written through the wrapper reach the
        # underlying file as Latin-1.
        latin1_sink = io.BytesIO()
        recoding_writer = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        recoding_writer.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001641
# Codec names that BasicUnicodeTest iterates over.  test_basics looks each
# one up and compares the registered codec name with the entry here, apart
# from "-" versus "_" (with a special case for "latin_1").
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when this build's codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (the stream reader/writer checks in BasicUnicodeTest skip them).
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings for which BasicUnicodeTest also skips the incremental
# encoder/decoder and getstate()/setstate() checks.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001760
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks run against every codec named in all_unicode_encodings.

    def test_basics(self):
        # Round-trip a short ASCII string through each codec using the
        # stateless functions, the stream writer/reader, and the incremental
        # coders obtained both from codecs and from _testcapi.
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, apart
            # from "-" versus "_".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and drain the queue after
                # each write; every chunk must be a bytes object.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still buffers
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Rewind and re-read several times; every pass must return the
            # complete text again.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # idna and punycode are also checked to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (The comparison below is deliberately trivial.)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        # (the checks themselves come from MixInCheckStateHandling).
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1898
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with the three mapping flavours.

    Each test feeds the bytes 0, 1 and 2 through a mapping given as a str
    (indexed by byte value), a dict of int -> str, or a dict of int -> int
    (code points), under the "strict", "replace" and "ignore" handlers.
    """

    def test_decode_with_string_map(self):
        # Byte N decodes to the N-th character of the mapping string.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Non-BMP characters are valid mapping targets.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # A mapping string too short for byte 2 is an error in strict mode.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # U+FFFE in the mapping is rejected in strict mode as well.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        # "replace" substitutes U+FFFD for the undecodable byte.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "ignore" drops the undecodable byte but still reports it consumed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # With an empty mapping every byte is undecodable and ignored.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # A byte may map to more than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to the empty string, which is not an error.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        # A missing key is an error in strict mode.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        # So is an explicit None value.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point beyond sys.maxunicode is a TypeError, not a
        # decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
2092
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002093
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8; reads come back as Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002106
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only looked up when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point: strict decoding
        # fails, "replace" yields U+FFFD and consumes all 10 characters.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
2142
Serhiy Storchakad6793772013-01-29 10:20:44 +02002143
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() / unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash encode to themselves.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # Backslash followed by a newline decodes to nothing.
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        # Octal escapes take up to three octal digits; '8' is not one.
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Every other byte after a backslash is kept together with the
        # backslash.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # c is the escape letter, d the number of too-short digit counts
        # tried (0 .. d-1 zeros following the letter).
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # Out-of-range code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2220
2221
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / _decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 encodes to the byte of the same value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash is passed through unchanged unless it is followed by
        # 'u' or 'U'.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # c is the escape letter, d the number of too-short digit counts
        # tried (0 .. d-1 zeros following the letter).
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # Out-of-range code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2270
2271
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2304
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002305
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files across seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # The same scratch file is reused for every encoding below.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002361
Victor Stinner3fed0872010-05-22 02:16:27 +00002362
# Names of the bytes -> bytes transform codecs exercised by
# TransformCodecTest; optional ones are appended below when importable.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Maps each transform codec name to the aliases that must resolve to it.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound so tests can use it in a skipUnless() condition.
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    # bz2 is optional; without it the bz2 codec is simply not tested.
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002392
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes -> bytes transform codecs and their aliases.

    The codecs under test are listed in the module-level
    bytes_transform_encodings list; aliases come from transform_aliases.
    """

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Raw strings: the pattern contains regex escapes (\( \)).
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper is expected to chain an exception of the same type.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)
2512
Nick Coghlan8b097b42013-11-13 23:49:21 +10002513
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002519
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    """Codec search function: return the registered test codec or None."""
    return _TEST_CODECS.get(codec_name)
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2527
Nick Coghlan8b097b42013-11-13 23:49:21 +10002528class ExceptionChainingTest(unittest.TestCase):
2529
def setUp(self):
    """Pick a unique codec name and reset the exception to be raised."""
    # There's no way to unregister a codec search function, so we just
    # ensure we render this one fairly harmless after the test
    # case finishes by using the test case repr as the codec name
    # The codecs module normalizes codec names, although this doesn't
    # appear to be formally documented...
    # We also make sure we use a truly unique id for the custom codec
    # to avoid issues with the codec cache when running these tests
    # multiple times (e.g. when hunting for refleaks)
    unique_id = repr(self) + str(id(self))
    self.codec_name = encodings.normalize_encoding(unique_id).lower()

    # We store the object to raise on the instance because of a bad
    # interaction between the codec caching (which means we can't
    # recreate the codec entry) and regrtest refleak hunting (which
    # runs the same test instance multiple times). This means we
    # need to ensure the codecs call back in to the instance to find
    # out which exception to raise rather than binding them in a
    # closure to an object that may change on the next run
    self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002550
def tearDown(self):
    """Remove this test's codec from the local registry, if present."""
    _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002553
def set_codec(self, encode, decode):
    """Register *encode* / *decode* as the codec named self.codec_name."""
    codec_info = codecs.CodecInfo(encode, decode,
                                  name=self.codec_name)
    _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002558
@contextlib.contextmanager
def assertWrapped(self, operation, exc_type, msg):
    """Context manager asserting the body raises a wrapped codec error.

    The exception must be an *exc_type* whose text matches
    "<operation> with '<codec name>' codec failed (<type name>: <msg>)",
    and its __cause__ must be an *exc_type* carrying a traceback.
    *msg* is interpolated into a regular expression unescaped.
    """
    full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
        operation, self.codec_name, exc_type.__name__, msg)
    with self.assertRaisesRegex(exc_type, full_msg) as caught:
        yield caught
    # Only reached once the expected exception has been caught above.
    self.assertIsInstance(caught.exception.__cause__, exc_type)
    self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002567
def raise_obj(self, *args, **kwds):
    """Raise self.obj_to_raise, ignoring all arguments."""
    # Helper to dynamically change the object raised by a test codec
    raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002571
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
    """Assert that *obj_to_raise* is wrapped on every codec entry point.

    The test codec is installed with raise_obj as both encoder and
    decoder, then str.encode, codecs.encode, bytes.decode and
    codecs.decode are each checked with assertWrapped().
    """
    self.obj_to_raise = obj_to_raise
    self.set_codec(self.raise_obj, self.raise_obj)
    with self.assertWrapped("encoding", exc_type, msg):
        "str_input".encode(self.codec_name)
    with self.assertWrapped("encoding", exc_type, msg):
        codecs.encode("str_input", self.codec_name)
    with self.assertWrapped("decoding", exc_type, msg):
        b"bytes input".decode(self.codec_name)
    with self.assertWrapped("decoding", exc_type, msg):
        codecs.decode(b"bytes input", self.codec_name)
2583
def test_raise_by_type(self):
    # Raising the exception class itself yields an empty message.
    self.check_wrapped(RuntimeError, "")
2586
def test_raise_by_value(self):
    # Raising an instance carries its single str argument through.
    msg = "This should be wrapped"
    self.check_wrapped(RuntimeError(msg), msg)
2590
def test_raise_grandchild_subclass_exact_size(self):
    msg = "This should be wrapped"
    # Empty __slots__: the subclass adds no per-instance storage.
    class MyRuntimeError(RuntimeError):
        __slots__ = ()
    self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2596
def test_raise_subclass_with_weakref_support(self):
    msg = "This should be wrapped"
    # A plain subclass (no __slots__) must be wrapped as well.
    class MyRuntimeError(RuntimeError):
        pass
    self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2602
def check_not_wrapped(self, obj_to_raise, msg):
    """Assert that *obj_to_raise* propagates with a message matching *msg*.

    *msg* is a regular expression matched against the RuntimeError raised
    by str.encode, codecs.encode, bytes.decode and codecs.decode.
    """
    def raise_obj(*args, **kwds):
        raise obj_to_raise
    self.set_codec(raise_obj, raise_obj)
    with self.assertRaisesRegex(RuntimeError, msg):
        "str input".encode(self.codec_name)
    with self.assertRaisesRegex(RuntimeError, msg):
        codecs.encode("str input", self.codec_name)
    with self.assertRaisesRegex(RuntimeError, msg):
        b"bytes input".decode(self.codec_name)
    with self.assertRaisesRegex(RuntimeError, msg):
        codecs.decode(b"bytes input", self.codec_name)
2615
def test_init_override_is_not_wrapped(self):
    # An exception type with a custom __init__ signature is left alone.
    class CustomInit(RuntimeError):
        def __init__(self):
            pass
    self.check_not_wrapped(CustomInit, "")
2621
def test_new_override_is_not_wrapped(self):
    # An exception type with a custom __new__ signature is left alone.
    class CustomNew(RuntimeError):
        def __new__(cls):
            return super().__new__(cls)
    self.check_not_wrapped(CustomNew, "")
2627
def test_instance_attribute_is_not_wrapped(self):
    msg = "This should NOT be wrapped"
    exc = RuntimeError(msg)
    # The extra attribute is state beyond the type and a single str arg.
    exc.attr = 1
    self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002633
2634 def test_non_str_arg_is_not_wrapped(self):
2635 self.check_not_wrapped(RuntimeError(1), "1")
2636
2637 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002638 msg_re = r"^\('a', 'b', 'c'\)$"
2639 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002640
2641 # http://bugs.python.org/issue19609
2642 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002643 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002644 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002645 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002646 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002647 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002648 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002649 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002650 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002651 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002652 codecs.decode(b"bytes input", self.codec_name)
2653
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002654 def test_unflagged_non_text_codec_handling(self):
2655 # The stdlib non-text codecs are now marked so they're
2656 # pre-emptively skipped by the text model related methods
2657 # However, third party codecs won't be flagged, so we still make
2658 # sure the case where an inappropriate output type is produced is
2659 # handled appropriately
2660 def encode_to_str(*args, **kwds):
2661 return "not bytes!", 0
2662 def decode_to_bytes(*args, **kwds):
2663 return b"not str!", 0
2664 self.set_codec(encode_to_str, decode_to_bytes)
2665 # No input or output type checks on the codecs module functions
2666 encoded = codecs.encode(None, self.codec_name)
2667 self.assertEqual(encoded, "not bytes!")
2668 decoded = codecs.decode(None, self.codec_name)
2669 self.assertEqual(decoded, b"not str!")
2670 # Text model methods should complain
2671 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2672 "use codecs.encode\(\) to encode to arbitrary types$")
2673 msg = fmt.format(self.codec_name)
2674 with self.assertRaisesRegex(TypeError, msg):
2675 "str_input".encode(self.codec_name)
2676 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2677 "use codecs.decode\(\) to decode to arbitrary types$")
2678 msg = fmt.format(self.codec_name)
2679 with self.assertRaisesRegex(TypeError, msg):
2680 b"bytes input".decode(self.codec_name)
2681
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002682
Georg Brandl02524622010-12-02 18:06:51 +00002683
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Exercises codecs.code_page_encode() and codecs.code_page_decode()
    # with several numeric code pages.

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # -1 must be rejected with ValueError and 123 with OSError, for
        # encoding and decoding alike.
        for code_page, error_type in ((-1, ValueError), (123, OSError)):
            self.assertRaises(error_type,
                              codecs.code_page_encode, code_page, 'a')
            self.assertRaises(error_type,
                              codecs.code_page_decode, code_page, b'a')

    def test_code_page_name(self):
        # The error text has to mention the name of the code page.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Each row of tests is (raw bytes, error handler, expected text);
        # an expected value of None means UnicodeDecodeError must be
        # raised instead.
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            mismatch_note = ('%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            self.assertEqual(text, expected, mismatch_note)
            # assert 0 <= consumed <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each row of tests is (text, error handler, expected bytes);
        # an expected value of None means UnicodeEncodeError must be
        # raised instead.
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = result[0]
            mismatch_note = ('%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(data, expected, mismatch_note)
            # The second item must equal the length of the input text.
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_rows)
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_rows)

    def test_cp1252(self):
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_rows)
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_rows)

    def test_cp_utf7(self):
        cp = 65000
        encode_rows = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_rows)
        decode_rows = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_rows)

    def test_multibyte_encoding(self):
        cp932_rows = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_rows)
        utf8_decode_rows = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_rows)
        # The encode half only runs when the module-level VISTA_OR_LATER
        # flag is true.
        if VISTA_OR_LATER:
            utf8_encode_rows = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_rows)

    def test_incremental(self):
        # Every call passes False as the fourth positional argument
        # (presumably the "final" flag -- confirm against the codecs
        # implementation).  Per the expected results, a trailing
        # incomplete sequence is left unconsumed.
        rows = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in rows:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2831
2832
if __name__ == "__main__":
    # Run this module's tests when the file is executed as a script.
    unittest.main()