# blob: e03a1db325afc9bc1aaf732fabc8ce2d8948c668
import codecs
import contextlib
import io
import locale
import sys
import unittest
import warnings
import encodings

from test import support

# sys.getwindowsversion() only exists on Windows, hence the platform guard.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional; without it the size of wchar_t is recorded as -1.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Build an assertion helper for the codec function *coder*.

    *self* is the object whose ``assertEqual`` performs the comparison.
    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``, i.e. that *coder* produced *expect*
    and reported the whole input as consumed.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The queue is backed by a single sequence (the *buffer* passed to the
    constructor); write() appends to it and read() removes data from its
    front.  Only ``+=`` and slicing are used on the buffer.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            size = len(self._buffer)
        # Slicing keeps the buffer's own type, so the queue ends up holding
        # an empty object of that type once everything has been read.
        data = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return data

class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for codecs.

    The host class must supply the ``assertIsInstance``, ``assertFalse``
    and ``assertEqual`` methods (e.g. by also deriving from
    ``unittest.TestCase``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split the encoded input *s* at every possible position, move the
        # decoder state from one incremental decoder to a fresh one at the
        # split point and check that the combined output is still *u*.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split the text *u* at every position,
        # carry the state over to a fresh incremental encoder and check
        # that the concatenated output is still *s*.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Reader and decoder tests shared by the concrete codec test cases.

    Concrete subclasses provide ``encoding`` (the codec name) and
    ``ill_formed_sequence`` (the bytes that ``"\\uDC80"`` encodes to with
    the "surrogatepass" handler, without BOM) and also derive from
    ``unittest.TestCase`` for the assertion methods.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        single_bytes = [bytes([c]) for c in encoded]
        self.assertEqual(
            input,
            "".join(codecs.iterdecode(single_bytes, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line and join the lines with "|" so
            # that the line boundaries show up in the comparison.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines below.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one Queue, so data written by
        # the writer becomes available to the reader step by step.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected in place of ill_formed_sequence when decoding with the
    # "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        # Each error handler must turn the lone surrogate into the same
        # bytes as encoding the listed replacement text directly.
        for handler, replacement in [("backslashreplace", "[\\udc80]"),
                                     ("namereplace", "[\\udc80]"),
                                     ("xmlcharrefreplace", "[&#56448;]"),
                                     ("ignore", "[]"),
                                     ("replace", "[?]")]:
            self.assertEqual("[\uDC80]".encode(self.encoding, handler),
                             replacement.encode(self.encoding),
                             handler)

        # Encoding the empty string yields just the BOM (if the codec
        # writes one); it is stripped from the individual pieces below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)

class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec (byte order selected via BOM)."""

    encoding = "utf-32"
    # The lone surrogate U+DC80 in the byte order of the running machine.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # The lone surrogate U+DC80 as little-endian UTF-32.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # Every character takes four bytes, so a new character only shows
        # up in the output after each fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # The lone surrogate U+DC80 as big-endian UTF-32.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # Every character takes four bytes, so a new character only shows
        # up in the output after each fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec (byte order selected via BOM)."""

    encoding = "utf-16"
    # The lone surrogate U+DC80 in the byte order of the running machine.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # The lone surrogate U+DC80 as little-endian UTF-16.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Pairs of (invalid input, expected result with errors="replace").
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the big-endian UTF-16 codec ("utf-16-be")."""

    encoding = "utf-16-be"
    # The code unit 0xDC80 (a lone low surrogate) in big-endian order.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # One expected string per encoded byte (12 bytes in total);
        # presumably check_partial() feeds the input byte by byte and
        # compares the text decoded so far -- see ReadTest.check_partial.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            # subTest reports every failing vector instead of stopping at
            # the first one (same approach as UTF7Test.test_errors).
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError,
                                  codecs.utf_16_be_decode,
                                  raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'),
                                 expected)

    def test_nonbmp(self):
        # A non-BMP character must round-trip through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
741
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # One prefix length per encoded byte (1+2+2+3+3+4 = 15 bytes):
        # the number of characters of `text` expected at that point.
        prefix_lengths = (1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6)
        self.check_partial(text, [text[:size] for size in prefix_lengths])

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Incomplete surrogate sequences are still decode errors.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000795
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only "cp65001" codec."""

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None as the expected
        # value means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'namereplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            # subTest keeps going after a failing vector and labels it,
            # matching the table-driven style of UTF7Test.test_errors.
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                        '%a.encode("cp65001", %r)=%a != %a'
                        % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                        text.encode, "cp65001", errors)

    def test_decode(self):
        # (bytes, error handler, expected text); None as the expected
        # value means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                        '%a.decode("cp65001", %r)=%a != %a'
                        % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                        raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions;
        # the remaining assertions pin each error handler's output.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" must round-trip lone surrogates unchanged.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
897
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200898
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        # (prefix length, repeat count): the expected prefix of `text`
        # and how many consecutive steps of check_partial() it stays
        # unchanged for.  The counts add up to 32 expected values.
        runs = (
            (1, 2), (2, 1), (3, 1), (4, 5), (5, 1), (6, 5),
            (7, 1), (8, 5), (9, 1), (10, 8), (11, 1), (12, 1),
        )
        expected = [
            text[:size]
            for size, repeat in runs
            for _ in range(repeat)
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result expected with the 'replace' handler)
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        strict_decode = codecs.utf_7_decode
        for data, replaced in vectors:
            with self.subTest(raw=data):
                with self.assertRaises(UnicodeDecodeError):
                    strict_decode(data, 'strict', True)
                self.assertEqual(data.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair encode to
        # the same bytes, and those bytes decode to the astral character.
        encoded = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Disable the inherited lone-surrogate test for this codec.
    test_lone_surrogates = None
972
973
class UTF16ExTest(unittest.TestCase):
    """Error-path tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single stray byte decoded in strict mode must be rejected.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling the decoder without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
981
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is accepted as input.
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
997
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The BOM written by the encoder must be dropped by the
        # incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Shared driver for the stream tests: read `bytestring` through a
        # "utf-8-sig" StreamReader with a range of size hints (None means
        # "read everything at once") and check that the concatenated
        # chunks equal `unistring`.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                got = ostream.getvalue()
                self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not show up in the
        # decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1081
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash is copied through verbatim.
        for code in range(256):
            char = bytes([code])
            if char == b'\\':
                continue
            data = char + b'0'
            self.assertEqual(codecs.escape_decode(data), (data, 2))

    def test_escape(self):
        # (escaped input, expected output); the reported length is always
        # the full length of the input.
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in cases:
            self.assertEqual(codecs.escape_decode(escaped),
                             (plain, len(escaped)))
        # A backslash followed by a byte that starts no known escape is
        # left untouched.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            literal = b'\\' + bytes([code])
            self.assertEqual(codecs.escape_decode(literal),
                             (literal, len(literal)))

    def test_errors(self):
        # Truncated \x escapes: strict mode raises, "ignore" drops them
        # and "replace" substitutes b"?".
        for digits in (b"", b"0"):
            bare = b"\\x" + digits
            bracketed = b"[" + bare + b"]"
            with self.assertRaises(ValueError):
                codecs.escape_decode(bare)
            with self.assertRaises(ValueError):
                codecs.escape_decode(bracketed)
            both = bracketed + bare
            self.assertEqual(codecs.escape_decode(both, "ignore"),
                             (b"[]", len(both)))
            self.assertEqual(codecs.escape_decode(both, "replace"),
                             (b"[?]?", len(both)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001133
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so the test only has to write through
        # the recoding wrapper and close it again.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001142
# From RFC 3492
# Sample strings used by PunycodeTest.  Each entry is a
# (text, punycode bytes) pair; the leading letter in each comment is the
# label that the original comments attach to the sample.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1246
# Import-time sanity check: report any entry of the table that is not a
# two-item (text, punycode) pair, e.g. because of a missing comma.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001250
class PunycodeTest(unittest.TestCase):
    """Checks the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest reports every failing sample instead of stopping at
            # the first one.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode once more from a bytes object rebuilt through an
                # ASCII round trip.
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001269
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        # The byte strings below are written in big-endian order and are
        # reversed on little-endian machines before use.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in valid:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings():
                self.assertEqual(text, raw.decode("unicode_internal"))
        for raw in rejected:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings(deprecation):
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode("unicode_internal")
        # 0x110000 is the first value above the valid code point range.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The exception has to identify the codec, the whole input and
        # the offending byte range (the second four-byte unit).
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        try:
            with support.check_warnings(deprecation):
                data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A custom error handler that ignores errors must make the
        # decoder skip the bogus unit inserted between "a" and "b".
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with support.check_warnings(deprecation):
            ab = "ab".encode("unicode_internal").decode()
            corrupted = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "ascii")
            ignored = decode(corrupted, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        with support.check_warnings(deprecation):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encode(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001345
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is a pair (input, expected) of UTF-8 encoded byte strings, as
# consumed by NameprepTest.test_nameprep:
#   * expected is None when nameprep() must reject the input with
#     UnicodeError;
#   * a (None, None) entry is a skipped vector; it is kept as a placeholder
#     so that the entry at list index N-1 is always test vector "3.N".
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1498
1499
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (raw_input, raw_expected) in enumerate(nameprep_tests):
            # Placeholder entry: this vector is skipped.
            if raw_input is None:
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw_input, "utf-8", "surrogatepass")
            if raw_expected is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError):
                    nameprep(text)
                continue
            expected = str(raw_expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as e:
                # Re-raise with the vector number ("3.N") in the message.
                raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001518
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    # (encoded bytes, decoded text) pairs shared by the one-shot and the
    # incremental tests: an ASCII name and an IDN name, each with and
    # without the trailing root dot.  Keeping one table guarantees that the
    # incremental tests cover exactly the same names as the one-shot tests
    # (the incremental tests used to check the dot-terminated IDN name twice
    # and never the dot-less one).
    _NAMES = (
        (b"python.org", "python.org"),
        (b"python.org.", "python.org."),
        (b"xn--pythn-mua.org", "pyth\xf6n.org"),
        (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
    )

    def test_builtin_decode(self):
        for encoded, decoded in self._NAMES:
            self.assertEqual(str(encoded, "idna"), decoded)

    def test_builtin_encode(self):
        for encoded, decoded in self._NAMES:
            self.assertEqual(decoded.encode("idna"), encoded)

    def test_stream(self):
        # After the whole input has been read, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        for encoded, decoded in self._NAMES:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                decoded,
                "input=%r" % (encoded,)
            )

        # A label is only emitted once its terminating dot has been seen
        # or the final flag is set.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() feeds the text to the incremental encoder piecewise.
        for encoded, decoded in self._NAMES:
            self.assertEqual(
                b"".join(codecs.iterencode(decoded, "idna")),
                encoded,
                "input=%r" % (decoded,)
            )

        # A label is only emitted once its terminating dot has been seen
        # or the final flag is set.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001594
class CodecsModuleTest(unittest.TestCase):
    """Argument handling of the module-level functions in codecs."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding given: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

        # test keywords
        decoded = codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        decoded = codecs.decode(b'[\xff]', 'ascii', errors='ignore')
        self.assertEqual(decoded, '[]')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding given: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

        # test keywords
        encoded = codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        encoded = codecs.encode('[\xff]', 'ascii', errors='ignore')
        self.assertEqual(encoded, b'[]')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE whatever the outcome of the test.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        self.assertEqual(codecs.lookup('ASCII').name, 'ascii')
1661
class StreamReaderTest(unittest.TestCase):
    """Line splitting of a UTF-8 StreamReader over a bytes stream."""

    def setUp(self):
        # Two multi-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001671
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, read() hands back
        # the same text as UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        transcoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(transcoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes are written, the underlying file receives
        # Latin-1.
        sink = io.BytesIO()
        transcoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        transcoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001683
# Names of the text encodings exercised by BasicUnicodeTest.  Each name is
# passed to codecs.lookup() / getencoder() / getdecoder() as is.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (BasicUnicodeTest skips its stream reader/writer and seek checks for them).
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings for which BasicUnicodeTest skips the incremental encoder/decoder
# and getstate()/setstate() checks: the ones above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001802
Walter Dörwald3abcb012007-04-16 22:10:50 +00001803class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001804 def test_basics(self):
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001805 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001806 for encoding in all_unicode_encodings:
Thomas Woutersa9773292006-04-21 09:43:23 +00001807 name = codecs.lookup(encoding).name
1808 if encoding.endswith("_codec"):
1809 name += "_codec"
1810 elif encoding == "latin_1":
1811 name = "latin_1"
1812 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001813
Ezio Melottiadc417c2011-11-17 12:23:34 +02001814 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001815 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001816 (b, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001817 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Victor Stinner040e16e2011-11-15 22:44:05 +01001818 (chars, size) = codecs.getdecoder(encoding)(b)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001819 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001820
1821 if encoding not in broken_unicode_with_streams:
1822 # check stream reader/writer
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001823 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001824 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001825 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001826 for c in s:
1827 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001828 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001829 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001830 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001831 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001832 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001833 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001834 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001835 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001836 decodedresult += reader.read()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001837 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001838
Thomas Wouters89f507f2006-12-13 04:49:30 +00001839 if encoding not in broken_incremental_coders:
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001840 # check incremental decoder/encoder and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001841 try:
1842 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001843 except LookupError: # no IncrementalEncoder
Thomas Woutersa9773292006-04-21 09:43:23 +00001844 pass
1845 else:
1846 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001847 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001848 for c in s:
1849 encodedresult += encoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001850 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001851 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001852 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001853 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001854 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001855 decodedresult += decoder.decode(b"", True)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001856 self.assertEqual(decodedresult, s,
1857 "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001858
1859 # check iterencode()/iterdecode()
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001860 result = "".join(codecs.iterdecode(
1861 codecs.iterencode(s, encoding), encoding))
1862 self.assertEqual(result, s, "encoding=%r" % encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001863
1864 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001865 result = "".join(codecs.iterdecode(
1866 codecs.iterencode("", encoding), encoding))
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001867 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001868
Victor Stinner554f3f02010-06-16 23:33:54 +00001869 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001870 # check incremental decoder/encoder with errors argument
1871 try:
1872 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001873 except LookupError: # no IncrementalEncoder
Thomas Wouters89f507f2006-12-13 04:49:30 +00001874 pass
1875 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001876 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001877 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001878 decodedresult = "".join(decoder.decode(bytes([c]))
1879 for c in encodedresult)
1880 self.assertEqual(decodedresult, s,
1881 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001882
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001883 @support.cpython_only
1884 def test_basics_capi(self):
1885 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
1886 s = "abc123" # all codecs should be able to encode these
1887 for encoding in all_unicode_encodings:
1888 if encoding not in broken_incremental_coders:
1889 # check incremental decoder/encoder (fetched via the C API)
1890 try:
1891 cencoder = codec_incrementalencoder(encoding)
1892 except LookupError: # no IncrementalEncoder
1893 pass
1894 else:
1895 # check C API
1896 encodedresult = b""
1897 for c in s:
1898 encodedresult += cencoder.encode(c)
1899 encodedresult += cencoder.encode("", True)
1900 cdecoder = codec_incrementaldecoder(encoding)
1901 decodedresult = ""
1902 for c in encodedresult:
1903 decodedresult += cdecoder.decode(bytes([c]))
1904 decodedresult += cdecoder.decode(b"", True)
1905 self.assertEqual(decodedresult, s,
1906 "encoding=%r" % encoding)
1907
1908 if encoding not in ("idna", "mbcs"):
1909 # check incremental decoder/encoder with errors argument
1910 try:
1911 cencoder = codec_incrementalencoder(encoding, "ignore")
1912 except LookupError: # no IncrementalEncoder
1913 pass
1914 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001915 encodedresult = b"".join(cencoder.encode(c) for c in s)
Serhiy Storchaka5cfc79d2014-02-07 10:06:39 +02001916 cdecoder = codec_incrementaldecoder(encoding, "ignore")
1917 decodedresult = "".join(cdecoder.decode(bytes([c]))
1918 for c in encodedresult)
1919 self.assertEqual(decodedresult, s,
1920 "encoding=%r" % encoding)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001921
Walter Dörwald729c31f2005-03-14 19:06:30 +00001922 def test_seek(self):
1923 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001924 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001925 for encoding in all_unicode_encodings:
1926 if encoding == "idna": # FIXME: See SF bug #1163178
1927 continue
1928 if encoding in broken_unicode_with_streams:
1929 continue
Victor Stinner05010702011-05-27 16:50:40 +02001930 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001931 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001932 # Test that calling seek resets the internal codec state and buffers
1933 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001934 data = reader.read()
1935 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001936
Walter Dörwalde22d3392005-11-17 08:52:34 +00001937 def test_bad_decode_args(self):
1938 for encoding in all_unicode_encodings:
1939 decoder = codecs.getdecoder(encoding)
1940 self.assertRaises(TypeError, decoder)
1941 if encoding not in ("idna", "punycode"):
1942 self.assertRaises(TypeError, decoder, 42)
1943
1944 def test_bad_encode_args(self):
1945 for encoding in all_unicode_encodings:
1946 encoder = codecs.getencoder(encoding)
Ezio Melottiadc417c2011-11-17 12:23:34 +02001947 with support.check_warnings():
1948 # unicode-internal has been deprecated
1949 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00001950
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001951 def test_encoding_map_type_initialized(self):
1952 from encodings import cp1140
1953 # This used to crash, we are only verifying there's no crash.
1954 table_type = type(cp1140.encoding_table)
1955 self.assertEqual(table_type, table_type)
1956
def test_decoder_state(self):
    # Check that getstate() and setstate() handle the state properly
    # for every encoding not listed in broken_incremental_coders.
    u = "abc123"
    for encoding in all_unicode_encodings:
        if encoding not in broken_incremental_coders:
            self.check_state_handling_decode(encoding, u, u.encode(encoding))
            self.check_state_handling_encode(encoding, u, u.encode(encoding))
1964
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with the supported mapping kinds.

    Each test decodes the three bytes 0x00 0x01 0x02 through a mapping
    given as a str, an int -> str dict or an int -> int dict, and checks
    the "strict", "replace" and "ignore" error handlers for unmapped bytes.
    """

    def test_decode_with_string_map(self):
        # The mapping is a str indexed by byte value.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # A byte beyond the end of the map is an error in strict mode.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # U+FFFE in the map marks the byte as undefined.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty map drops every byte but still consumes all input.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # The mapping is a dict from byte value to a replacement str.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # Replacement strings may be longer than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or empty.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        # Missing key, None and U+FFFE all mean "undefined".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        # The mapping is a dict from byte value to a code point.
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point outside the Unicode range is a mapping error
        # (TypeError), not a decoding error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
2158
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002159
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The underlying stream holds UTF-8; reads are recoded to Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        # Reading through the wrapper decodes the UTF-8 bytes to str.
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002172
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is not available on every platform, so it is only
        # checked when the codecs module exposes it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # Escapes beyond U+10FFFF are errors in strict mode and become
        # U+FFFD with the "replace" handler.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
2208
Serhiy Storchakad6793772013-01-29 10:20:44 +02002209
2210class UnicodeEscapeTest(unittest.TestCase):
2211 def test_empty(self):
2212 self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
2213 self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
2214
2215 def test_raw_encode(self):
2216 encode = codecs.unicode_escape_encode
2217 for b in range(32, 127):
2218 if b != b'\\'[0]:
2219 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2220
2221 def test_raw_decode(self):
2222 decode = codecs.unicode_escape_decode
2223 for b in range(256):
2224 if b != b'\\'[0]:
2225 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2226
2227 def test_escape_encode(self):
2228 encode = codecs.unicode_escape_encode
2229 check = coding_checker(self, encode)
2230 check('\t', br'\t')
2231 check('\n', br'\n')
2232 check('\r', br'\r')
2233 check('\\', br'\\')
2234 for b in range(32):
2235 if chr(b) not in '\t\n\r':
2236 check(chr(b), ('\\x%02x' % b).encode())
2237 for b in range(127, 256):
2238 check(chr(b), ('\\x%02x' % b).encode())
2239 check('\u20ac', br'\u20ac')
2240 check('\U0001d120', br'\U0001d120')
2241
2242 def test_escape_decode(self):
2243 decode = codecs.unicode_escape_decode
2244 check = coding_checker(self, decode)
2245 check(b"[\\\n]", "[]")
2246 check(br'[\"]', '["]')
2247 check(br"[\']", "[']")
2248 check(br"[\\]", r"[\]")
2249 check(br"[\a]", "[\x07]")
2250 check(br"[\b]", "[\x08]")
2251 check(br"[\t]", "[\x09]")
2252 check(br"[\n]", "[\x0a]")
2253 check(br"[\v]", "[\x0b]")
2254 check(br"[\f]", "[\x0c]")
2255 check(br"[\r]", "[\x0d]")
2256 check(br"[\7]", "[\x07]")
2257 check(br"[\8]", r"[\8]")
2258 check(br"[\78]", "[\x078]")
2259 check(br"[\41]", "[!]")
2260 check(br"[\418]", "[!8]")
2261 check(br"[\101]", "[A]")
2262 check(br"[\1010]", "[A0]")
2263 check(br"[\x41]", "[A]")
2264 check(br"[\x410]", "[A0]")
2265 check(br"\u20ac", "\u20ac")
2266 check(br"\U0001d120", "\U0001d120")
2267 for b in range(256):
2268 if b not in b'\n"\'\\abtnvfr01234567xuUN':
2269 check(b'\\' + bytes([b]), '\\' + chr(b))
2270
2271 def test_decode_errors(self):
2272 decode = codecs.unicode_escape_decode
2273 for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
2274 for i in range(d):
2275 self.assertRaises(UnicodeDecodeError, decode,
2276 b"\\" + c + b"0"*i)
2277 self.assertRaises(UnicodeDecodeError, decode,
2278 b"[\\" + c + b"0"*i + b"]")
2279 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2280 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2281 self.assertEqual(decode(data, "replace"),
2282 ("[\ufffd]\ufffd", len(data)))
2283 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2284 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2285 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2286
2287
Serhiy Storchakac9c43382013-01-29 11:40:00 +02002288class RawUnicodeEscapeTest(unittest.TestCase):
2289 def test_empty(self):
2290 self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
2291 self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
2292
2293 def test_raw_encode(self):
2294 encode = codecs.raw_unicode_escape_encode
2295 for b in range(256):
2296 self.assertEqual(encode(chr(b)), (bytes([b]), 1))
2297
2298 def test_raw_decode(self):
2299 decode = codecs.raw_unicode_escape_decode
2300 for b in range(256):
2301 self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))
2302
2303 def test_escape_encode(self):
2304 encode = codecs.raw_unicode_escape_encode
2305 check = coding_checker(self, encode)
2306 for b in range(256):
2307 if b not in b'uU':
2308 check('\\' + chr(b), b'\\' + bytes([b]))
2309 check('\u20ac', br'\u20ac')
2310 check('\U0001d120', br'\U0001d120')
2311
2312 def test_escape_decode(self):
2313 decode = codecs.raw_unicode_escape_decode
2314 check = coding_checker(self, decode)
2315 for b in range(256):
2316 if b not in b'uU':
2317 check(b'\\' + bytes([b]), '\\' + chr(b))
2318 check(br"\u20ac", "\u20ac")
2319 check(br"\U0001d120", "\U0001d120")
2320
2321 def test_decode_errors(self):
2322 decode = codecs.raw_unicode_escape_decode
2323 for c, d in (b'u', 4), (b'U', 4):
2324 for i in range(d):
2325 self.assertRaises(UnicodeDecodeError, decode,
2326 b"\\" + c + b"0"*i)
2327 self.assertRaises(UnicodeDecodeError, decode,
2328 b"[\\" + c + b"0"*i + b"]")
2329 data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
2330 self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
2331 self.assertEqual(decode(data, "replace"),
2332 ("[\ufffd]\ufffd", len(data)))
2333 self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
2334 self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
2335 self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2336
2337
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip tests for the "surrogateescape" error handler.

    Undecodable bytes 0xNN are expected to decode to the lone surrogate
    U+DCNN, and encoding those surrogates must give the original bytes back.
    """

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2370
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002371
class BomTest(unittest.TestCase):
    """Check BOM handling of files opened with codecs.open()."""

    def test_seek0(self):
        # Each scenario writes through a 'w+' file, seeks, and reads the
        # text back; a stray or missing BOM would corrupt what is read.
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002427
Victor Stinner3fed0872010-05-22 02:16:27 +00002428
# Names of the bytes -> bytes transform codecs exercised by
# TransformCodecTest.  Codecs backed by optional modules are appended below.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Canonical codec name -> aliases that must resolve to the same codec.
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound so skipUnless(zlib, ...) decorators still work.
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
Georg Brandl02524622010-12-02 18:06:51 +00002458
2459class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002460
Georg Brandl02524622010-12-02 18:06:51 +00002461 def test_basics(self):
2462 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002463 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002464 with self.subTest(encoding=encoding):
2465 # generic codecs interface
2466 (o, size) = codecs.getencoder(encoding)(binput)
2467 self.assertEqual(size, len(binput))
2468 (i, size) = codecs.getdecoder(encoding)(o)
2469 self.assertEqual(size, len(o))
2470 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002471
Georg Brandl02524622010-12-02 18:06:51 +00002472 def test_read(self):
2473 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002474 with self.subTest(encoding=encoding):
2475 sin = codecs.encode(b"\x80", encoding)
2476 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2477 sout = reader.read()
2478 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002479
2480 def test_readline(self):
2481 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002482 with self.subTest(encoding=encoding):
2483 sin = codecs.encode(b"\x80", encoding)
2484 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2485 sout = reader.readline()
2486 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002487
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002488 def test_buffer_api_usage(self):
2489 # We check all the transform codecs accept memoryview input
2490 # for encoding and decoding
2491 # and also that they roundtrip correctly
2492 original = b"12345\x80"
2493 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002494 with self.subTest(encoding=encoding):
2495 data = original
2496 view = memoryview(data)
2497 data = codecs.encode(data, encoding)
2498 view_encoded = codecs.encode(view, encoding)
2499 self.assertEqual(view_encoded, data)
2500 view = memoryview(data)
2501 data = codecs.decode(data, encoding)
2502 self.assertEqual(data, original)
2503 view_decoded = codecs.decode(view, encoding)
2504 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002505
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002506 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002507 # Check binary -> binary codecs give a good error for str input
2508 bad_input = "bad input type"
2509 for encoding in bytes_transform_encodings:
2510 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002511 fmt = ( "{!r} is not a text encoding; "
2512 "use codecs.encode\(\) to handle arbitrary codecs")
2513 msg = fmt.format(encoding)
2514 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002515 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002516 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002517
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002518 def test_text_to_binary_blacklists_text_transforms(self):
2519 # Check str.encode gives a good error message for str -> str codecs
2520 msg = (r"^'rot_13' is not a text encoding; "
2521 "use codecs.encode\(\) to handle arbitrary codecs")
2522 with self.assertRaisesRegex(LookupError, msg):
2523 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002524
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002525 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002526 # Check bytes.decode and bytearray.decode give a good error
2527 # message for binary -> binary codecs
2528 data = b"encode first to ensure we meet any format restrictions"
2529 for encoding in bytes_transform_encodings:
2530 with self.subTest(encoding=encoding):
2531 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002532 fmt = (r"{!r} is not a text encoding; "
2533 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002534 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002535 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002536 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002537 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002538 bytearray(encoded_data).decode(encoding)
2539
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002540 def test_binary_to_text_blacklists_text_transforms(self):
2541 # Check str -> str codec gives a good error for binary input
2542 for bad_input in (b"immutable", bytearray(b"mutable")):
2543 with self.subTest(bad_input=bad_input):
2544 msg = (r"^'rot_13' is not a text encoding; "
2545 "use codecs.decode\(\) to handle arbitrary codecs")
2546 with self.assertRaisesRegex(LookupError, msg) as failure:
2547 bad_input.decode("rot_13")
2548 self.assertIsNone(failure.exception.__cause__)
2549
Zachary Wareefa2e042013-12-30 14:54:11 -06002550 @unittest.skipUnless(zlib, "Requires zlib support")
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002551 def test_custom_zlib_error_is_wrapped(self):
2552 # Check zlib codec gives a good error for malformed input
2553 msg = "^decoding with 'zlib_codec' codec failed"
2554 with self.assertRaisesRegex(Exception, msg) as failure:
2555 codecs.decode(b"hello", "zlib_codec")
2556 self.assertIsInstance(failure.exception.__cause__,
2557 type(failure.exception))
2558
2559 def test_custom_hex_error_is_wrapped(self):
2560 # Check hex codec gives a good error for malformed input
2561 msg = "^decoding with 'hex_codec' codec failed"
2562 with self.assertRaisesRegex(Exception, msg) as failure:
2563 codecs.decode(b"hello", "hex_codec")
2564 self.assertIsInstance(failure.exception.__cause__,
2565 type(failure.exception))
2566
2567 # Unfortunately, the bz2 module throws OSError, which the codec
2568 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002569
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002570 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2571 def test_aliases(self):
2572 for codec_name, aliases in transform_aliases.items():
2573 expected_name = codecs.lookup(codec_name).name
2574 for alias in aliases:
2575 with self.subTest(alias=alias):
2576 info = codecs.lookup(alias)
2577 self.assertEqual(info.name, expected_name)
2578
Serhiy Storchaka519114d2014-11-07 14:04:37 +02002579 def test_uu_invalid(self):
2580 # Missing "begin" line
2581 self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2582
Nick Coghlan8b097b42013-11-13 23:49:21 +10002583
# The codec system tries to wrap exceptions in order to ensure the error
# mentions the operation being performed and the codec involved. We
# currently *only* want this to happen for relatively stateless
# exceptions, where the only significant information they contain is their
# type and a single str argument.

# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Codec search function: return the CodecInfo stored under codec_name
    # in _TEST_CODECS, or None when no test codec has that name.
    return _TEST_CODECS.get(codec_name)
codecs.register(_get_test_codec) # Returns None, not usable as a decorator

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    def _forget_codec(codec_name):
        # Fallback when _codecs has no _forget_codec: nothing to clear.
        pass
2604
2605
Nick Coghlan8b097b42013-11-13 23:49:21 +10002606class ExceptionChainingTest(unittest.TestCase):
2607
def setUp(self):
    # Pick a unique, normalized codec name for this test instance and
    # reset the object the test codec raises.

    # There's no way to unregister a codec search function, so we just
    # ensure we render this one fairly harmless after the test
    # case finishes by using the test case repr as the codec name
    # The codecs module normalizes codec names, although this doesn't
    # appear to be formally documented...
    # We also make sure we use a truly unique id for the custom codec
    # to avoid issues with the codec cache when running these tests
    # multiple times (e.g. when hunting for refleaks)
    unique_id = repr(self) + str(id(self))
    self.codec_name = encodings.normalize_encoding(unique_id).lower()

    # We store the object to raise on the instance because of a bad
    # interaction between the codec caching (which means we can't
    # recreate the codec entry) and regrtest refleak hunting (which
    # runs the same test instance multiple times). This means we
    # need to ensure the codecs call back in to the instance to find
    # out which exception to raise rather than binding them in a
    # closure to an object that may change on the next run
    self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002628
def tearDown(self):
    # Remove this test's codec from the local registry and from the
    # lookup caches so repeated runs start from a clean state.
    _TEST_CODECS.pop(self.codec_name, None)
    # Issue #22166: Also pop from caches to avoid appearance of ref leaks
    encodings._cache.pop(self.codec_name, None)
    try:
        _forget_codec(self.codec_name)
    except KeyError:
        # The codec was never looked up, so there is nothing to forget.
        pass
Nick Coghlan8b097b42013-11-13 23:49:21 +10002637
def set_codec(self, encode, decode):
    # Publish a CodecInfo built from the given encode/decode callables
    # under this test's codec name in the local registry.
    codec_info = codecs.CodecInfo(encode, decode,
                                  name=self.codec_name)
    _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002642
@contextlib.contextmanager
def assertWrapped(self, operation, exc_type, msg):
    # Context manager: the body must raise exc_type with a message of the
    # form "<operation> with '<codec>' codec failed (<type>: <msg>)", and
    # that exception must chain to an exc_type cause with a traceback.
    # Note that msg is inserted into the regular expression unescaped.
    full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
              operation, self.codec_name, exc_type.__name__, msg)
    with self.assertRaisesRegex(exc_type, full_msg) as caught:
        yield caught
    # caught.exception is only populated once the with block has exited.
    self.assertIsInstance(caught.exception.__cause__, exc_type)
    self.assertIsNotNone(caught.exception.__cause__.__traceback__)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002651
def raise_obj(self, *args, **kwds):
    # Helper to dynamically change the object raised by a test codec
    # (arguments are accepted and ignored so it can serve as both the
    # encode and the decode function)
    raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002655
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
    # Install a codec that raises obj_to_raise and check that all four
    # entry points (str.encode, codecs.encode, bytes.decode,
    # codecs.decode) report it wrapped as exc_type with message msg.
    self.obj_to_raise = obj_to_raise
    self.set_codec(self.raise_obj, self.raise_obj)
    with self.assertWrapped("encoding", exc_type, msg):
        "str_input".encode(self.codec_name)
    with self.assertWrapped("encoding", exc_type, msg):
        codecs.encode("str_input", self.codec_name)
    with self.assertWrapped("decoding", exc_type, msg):
        b"bytes input".decode(self.codec_name)
    with self.assertWrapped("decoding", exc_type, msg):
        codecs.decode(b"bytes input", self.codec_name)
2667
2668 def test_raise_by_type(self):
2669 self.check_wrapped(RuntimeError, "")
2670
2671 def test_raise_by_value(self):
2672 msg = "This should be wrapped"
2673 self.check_wrapped(RuntimeError(msg), msg)
2674
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002675 def test_raise_grandchild_subclass_exact_size(self):
2676 msg = "This should be wrapped"
2677 class MyRuntimeError(RuntimeError):
2678 __slots__ = ()
2679 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2680
2681 def test_raise_subclass_with_weakref_support(self):
2682 msg = "This should be wrapped"
2683 class MyRuntimeError(RuntimeError):
2684 pass
2685 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2686
def check_not_wrapped(self, obj_to_raise, msg):
    """Check all four encode/decode paths raise RuntimeError matching *msg*.

    Installs a codec whose encoder and decoder both raise *obj_to_raise*,
    then runs str.encode(), codecs.encode(), bytes.decode() and
    codecs.decode() and requires each to raise a RuntimeError whose text
    matches the regular expression *msg*.
    """
    def raise_obj(*args, **kwds):
        raise obj_to_raise

    self.set_codec(raise_obj, raise_obj)
    attempts = (
        lambda: "str input".encode(self.codec_name),
        lambda: codecs.encode("str input", self.codec_name),
        lambda: b"bytes input".decode(self.codec_name),
        lambda: codecs.decode(b"bytes input", self.codec_name),
    )
    for attempt in attempts:
        with self.assertRaisesRegex(RuntimeError, msg):
            attempt()
2699
2700 def test_init_override_is_not_wrapped(self):
2701 class CustomInit(RuntimeError):
2702 def __init__(self):
2703 pass
2704 self.check_not_wrapped(CustomInit, "")
2705
2706 def test_new_override_is_not_wrapped(self):
2707 class CustomNew(RuntimeError):
2708 def __new__(cls):
2709 return super().__new__(cls)
2710 self.check_not_wrapped(CustomNew, "")
2711
2712 def test_instance_attribute_is_not_wrapped(self):
2713 msg = "This should NOT be wrapped"
2714 exc = RuntimeError(msg)
2715 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002716 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002717
2718 def test_non_str_arg_is_not_wrapped(self):
2719 self.check_not_wrapped(RuntimeError(1), "1")
2720
2721 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002722 msg_re = r"^\('a', 'b', 'c'\)$"
2723 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002724
2725 # http://bugs.python.org/issue19609
2726 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002727 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002728 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002729 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002730 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002731 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002732 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002733 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002734 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002735 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002736 codecs.decode(b"bytes input", self.codec_name)
2737
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002738 def test_unflagged_non_text_codec_handling(self):
2739 # The stdlib non-text codecs are now marked so they're
2740 # pre-emptively skipped by the text model related methods
2741 # However, third party codecs won't be flagged, so we still make
2742 # sure the case where an inappropriate output type is produced is
2743 # handled appropriately
2744 def encode_to_str(*args, **kwds):
2745 return "not bytes!", 0
2746 def decode_to_bytes(*args, **kwds):
2747 return b"not str!", 0
2748 self.set_codec(encode_to_str, decode_to_bytes)
2749 # No input or output type checks on the codecs module functions
2750 encoded = codecs.encode(None, self.codec_name)
2751 self.assertEqual(encoded, "not bytes!")
2752 decoded = codecs.decode(None, self.codec_name)
2753 self.assertEqual(decoded, b"not str!")
2754 # Text model methods should complain
2755 fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2756 "use codecs.encode\(\) to encode to arbitrary types$")
2757 msg = fmt.format(self.codec_name)
2758 with self.assertRaisesRegex(TypeError, msg):
2759 "str_input".encode(self.codec_name)
2760 fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2761 "use codecs.decode\(\) to decode to arbitrary types$")
2762 msg = fmt.format(self.codec_name)
2763 with self.assertRaisesRegex(TypeError, msg):
2764 b"bytes input".decode(self.codec_name)
2765
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002766
Georg Brandl02524622010-12-02 18:06:51 +00002767
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is expected to raise ValueError, and code
        # page 123 is expected to raise OSError, for both directions.
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page ("cp932", "CP_UTF8").
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode every (raw, errors, expected) row of *tests* with *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError; otherwise the decoded text must equal
        *expected* and the consumed length must lie in [0, len(raw)].
        Each row runs in its own subTest so that one failing row does not
        hide the results of the rows after it.
        """
        for raw, errors, expected in tests:
            with self.subTest(cp=cp, raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = codecs.code_page_decode(cp, raw, errors, True)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from "cp%s" with '
                                  'errors=%r: %s' % (raw, cp, errors, err))
                    self.assertEqual(decoded[0], expected,
                        '%a.decode("cp%s", %r)=%a != %a'
                        % (raw, cp, errors, decoded[0], expected))
                    # assert 0 <= decoded[1] <= len(raw)
                    self.assertGreaterEqual(decoded[1], 0)
                    self.assertLessEqual(decoded[1], len(raw))
                else:
                    self.assertRaises(UnicodeDecodeError,
                        codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Encode every (text, errors, expected) row of *tests* with *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal
        *expected* and the consumed length must equal len(text).
        Each row runs in its own subTest so that one failing row does not
        hide the results of the rows after it.
        """
        for text, errors, expected in tests:
            with self.subTest(cp=cp, text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = codecs.code_page_encode(cp, text, errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to "cp%s" with '
                                  'errors=%r: %s' % (text, cp, errors, err))
                    self.assertEqual(encoded[0], expected,
                        '%a.encode("cp%s", %r)=%a != %a'
                        % (text, cp, errors, encoded[0], expected))
                    self.assertEqual(encoded[1], len(text))
                else:
                    self.assertRaises(UnicodeEncodeError,
                        codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        ))

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The CP_UTF8 encode rows are only run when VISTA_OR_LATER is set.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # final=False: a trailing incomplete sequence is not an error and
        # is left unconsumed, as the expected lengths below show.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
2924
2925
Fred Drake2e2be372001-09-20 21:33:42 +00002926if __name__ == "__main__":
Ezio Melotti5d3dba02013-01-11 06:02:07 +02002927 unittest.main()