blob: f6823805feec828cfe1e0aabd94e14cbb64b8387 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10003import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01004import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02005import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01006import sys
7import unittest
8import warnings
Nick Coghlanc72e4e62013-11-22 22:39:36 +10009import encodings
Victor Stinner040e16e2011-11-15 22:44:05 +010010
11from test import support

# True only when running on Windows with a major version >= 6;
# always False on every other platform.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional: when it cannot be imported, ``ctypes`` is None and
# SIZEOF_WCHAR_T is the sentinel -1; otherwise SIZEOF_WCHAR_T is the size
# in bytes of ctypes.c_wchar.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    # Build a checker bound to the test case *self* and the codec function
    # *coder*.  The returned ``check(input, expect)`` asserts that
    # ``coder(input)`` returns the pair ``(expect, len(input))``, i.e. the
    # expected output together with the full input length.
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. bytes or str)
        # determines the type of everything read back.
        self._buffer = buffer

    def write(self, chars):
        # Append *chars* to the end of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return data from the front of the queue: everything
        # when *size* is negative, otherwise at most *size* items.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class MixInCheckStateHandling:
    # Mixin for unittest.TestCase subclasses: verifies that incremental
    # codecs can be interrupted at every split point, have their state
    # saved with getstate(), and be resumed by a fresh codec via setstate().

    def check_state_handling_decode(self, encoding, u, s):
        # *s* is the encoded form of the text *u*.  For every split point i,
        # decode s[:i], transfer the state to a new decoder and decode the
        # rest; the concatenated output must equal *u*.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Mirror image of check_state_handling_decode: encode u[:i], move
        # the state to a new encoder, encode the rest with final=True and
        # require the concatenation to equal *s*.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    # Encoding-independent reader tests.  Concrete subclasses combine this
    # class with unittest.TestCase and must define the class attributes
    # ``encoding`` (codec name) and ``ill_formed_sequence`` (the bytes that
    # test_lone_surrogates expects "[\uDC80]"-style lone surrogates to map to).

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def check_bytewise(decoder):
            # Feed the encoded input one byte at a time to an incremental
            # decoder and compare the accumulated text after every byte.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        check_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        check_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line from a fresh reader and join them with "|"
            # so that the line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate("\n \r\n \r \u2028".split()):
            vw.append((i*200)*"\3042" + lineend)
            vwo.append((i*200)*"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in "\n \r\n \r \u2028".split():
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )

    def test_bug1175396(self):
        # NOTE(review): these literals are reproduced exactly as they appear
        # in this copy of the file; runs of spaces inside them may have been
        # collapsed by whatever produced this copy -- confirm against the
        # canonical source if the exact spacing matters.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        count = 0
        for (count, line) in enumerate(reader, 1):
            self.assertEqual(line, s[count - 1])
        # Without this check a reader that stops early (dropping trailing
        # lines) would go unnoticed, because the loop simply ends.
        self.assertEqual(count, len(s))

    def test_readlinequeue(self):
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected when ill_formed_sequence is decoded with the "replace"
    # error handler; subclasses may override it.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate must fail in strict mode and follow the
        # usual semantics of the other error handlers.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Encoding the empty string yields just the BOM (if the codec
        # writes one); it is stripped from the individual pieces below and
        # prepended once to the assembled test sequence.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)

class UTF32Test(ReadTest, unittest.TestCase):
    # Tests for the BOM-aware "utf-32" codec.
    encoding = "utf-32"
    # The lone surrogate U+DC80 as a single 32-bit unit in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated code unit with final=True goes through the error
        # handler and is still reported as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less little-endian "utf-32-le" codec.
    encoding = "utf-32-le"
    # The lone surrogate U+DC80 as one little-endian 32-bit unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        # No BOM: a character becomes available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less big-endian "utf-32-be" codec.
    encoding = "utf-32-be"
    # The lone surrogate U+DC80 as one big-endian 32-bit unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        # No BOM: a character becomes available after every fourth byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    # Tests for the BOM-aware "utf-16" codec.
    encoding = "utf-16"
    # The lone surrogate U+DC80 as a single 16-bit unit in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by exactly one BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated code unit with final=True goes through the error
        # handler and is still reported as consumed.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Opening with mode 'U' is expected to emit a DeprecationWarning.
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less little-endian "utf-16-le" codec.
    encoding = "utf-16-le"
    # The lone surrogate U+DC80 as one little-endian 16-bit unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        # BMP characters complete after two bytes; the final non-BMP
        # character needs all four bytes of its surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, text expected with the "replace" handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    # Tests for the BOM-less big-endian "utf-16-be" codec.
    encoding = "utf-16-be"
    # The lone surrogate U+DC80 as one big-endian 16-bit unit.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # BMP characters complete after two bytes; the final non-BMP
        # character needs all four bytes of its surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, text expected with the "replace" handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

class UTF8Test(ReadTest, unittest.TestCase):
    # Tests for the utf-8 codec; the attributes below parameterise the
    # checks inherited from the base classes.
    encoding = "utf-8"
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # One expected result per input byte: a character is emitted
        # only after the last byte of its encoding has been fed.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        stages = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, stages)

    def test_decoder_state(self):
        # The sample covers the boundary code points of every UTF-8
        # encoded length (1 to 4 bytes).
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # Each pair must round-trip in both directions when lone
        # surrogates are allowed through.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, data in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), data)
            self.assertEqual(data.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A surrogate sequence that is cut short or interrupted is still
        # an error, even with surrogatepass.
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000751
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Tests for the cp65001 codec.  Expectations for lone surrogates
    # differ depending on the module-level VISTA_OR_LATER flag.
    encoding = "cp65001"

    def test_encode(self):
        # Entries are (text, error handler, expected bytes); an expected
        # value of None means the call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Entries are (bytes, error handler, expected text); an expected
        # value of None means the call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # With the default (strict) handler lone surrogates are rejected
        # in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError,
                          b"\xed\xa0\x80".decode, "cp65001")
        # Every other handler yields a well-defined replacement.
        by_handler = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler),
                             expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # Each pair must round-trip in both directions when lone
        # surrogates are allowed through.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, data in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), data)
            self.assertEqual(data.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
850
851
852
class UTF7Test(ReadTest, unittest.TestCase):
    # Tests for the utf-7 codec.
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per input byte of the encoded form.
        text = "a+-b"
        stages = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, stages)

    def test_errors(self):
        # (malformed input, result expected with errors='replace')
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        strict_decode = codecs.utf_7_decode
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                # final=True: incomplete input must raise, not be buffered.
                self.assertRaises(UnicodeDecodeError, strict_decode,
                                  raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # A non-BMP character and its explicit surrogate pair encode to
        # the same bytes; decoding yields the single non-BMP character.
        codec = self.encoding
        packed = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(codec), packed)
        self.assertEqual('\ud801\udca0'.encode(codec), packed)
        self.assertEqual(packed.decode(codec), '\U000104A0')

    # Setting the name to None overrides the inherited test of that
    # name, so it is not run for this codec.
    test_lone_surrogates = None
899
900
class UTF16ExTest(unittest.TestCase):
    # Argument and error handling of the low-level utf_16_ex_decode().

    def test_errors(self):
        # A single byte cannot form a UTF-16 code unit; with final=True
        # the decoder has to raise rather than wait for more data.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
908
class ReadBufferTest(unittest.TestCase):
    # Tests for codecs.readbuffer_encode().

    def test_array(self):
        # Any object exposing the buffer interface is accepted.
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor a non-buffer object is allowed.
        for args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *args)
924
class UTF8SigTest(UTF8Test, unittest.TestCase):
    # Tests for the utf-8-sig codec (UTF-8 with an optional leading BOM
    # that is skipped on decoding).  Inherits the UTF8Test cases.
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected result per input byte of the encoded form.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The BOM written by the encoder must not appear in the decoded
        # output of the incremental decoder.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read *bytestring* through a utf-8-sig StreamReader using a
        # range of read sizes (None means "read everything at once") and
        # check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # subTest identifies the failing read size and keeps going.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    # An empty read signals end of stream.
                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starting with a BOM: the BOM is dropped from the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
1008
class EscapeDecodeTest(unittest.TestCase):
    # Tests for codecs.escape_decode().

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by b'0', must pass
        # through unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            data = byte + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded bytes), checked in order.
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash before any byte that does not start a recognised
        # escape is expected to come back unchanged.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            sequence = b'\\' + bytes([code])
            check(sequence, sequence)

    def test_errors(self):
        decode = codecs.escape_decode
        # A \x escape with zero or one hex digit is malformed.  The
        # second item is the number of bytes consumed for the
        # bracketed-plus-bare input used with 'ignore' and 'replace'.
        for bad, consumed in ((br"\x", 6), (br"\x0", 8)):
            bracketed = b"[" + bad + b"]"
            self.assertRaises(ValueError, decode, bad)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(bracketed + bad, "ignore"),
                             (b"[]", consumed))
            self.assertEqual(decode(bracketed + bad, "replace"),
                             (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001060
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        # There is nothing to assert: writing and closing without an
        # error is the whole test.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001069
# From RFC 3492
# Each entry is a pair (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check on the table above: report any entry that is not a
# (text, bytes) pair, e.g. because a comma between two adjacent string
# literals was dropped or added.
for case in punycode_testcases:
    if len(case) != 2:
        print(repr(case))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001177
class PunycodeTest(unittest.TestCase):
    # Runs the punycode codec over the module-level punycode_testcases
    # table of (unicode text, punycode bytes) pairs.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # subTest names the failing vector and lets the others run.
            with self.subTest(puny=puny):
                # Need to convert both strings to lower case, since
                # some of the extended encodings use upper case, but our
                # code produces only lower case. Converting just puny to
                # lower is also insufficient, since some of the input
                # characters are upper case.
                self.assertEqual(
                    str(uni.encode("punycode"), "ascii").lower(),
                    str(puny, "ascii").lower()
                )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            with self.subTest(puny=puny):
                self.assertEqual(uni, puny.decode("punycode"))
                # Decode again from a bytes object rebuilt via an ASCII
                # round trip (equal value, freshly created object).
                puny = puny.decode("ascii").encode("ascii")
                self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001196
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the deprecated unicode_internal codec.  Most of them
    # only apply where wchar_t is 4 bytes wide.

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # All byte strings below are written big-endian and reversed on
        # little-endian machines before use.
        little_endian = sys.byteorder == "little"
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in valid:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings():
                self.assertEqual(text, raw.decode("unicode_internal"))
        for raw in rejected:
            if little_endian:
                raw = bytes(reversed(raw))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")
        # 0x110000 is the first value past the last valid code point.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The second 4-byte unit is invalid; the exception must describe
        # exactly that unit (offsets 4 to 8 of the input).
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as exc:
            self.assertEqual("unicode_internal", exc.encoding)
            self.assertEqual(payload, exc.object)
            self.assertEqual(4, exc.start)
            self.assertEqual(8, exc.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the "ignore" behaviour under a private handler name
        # and check that the invalid 4-byte unit spliced between "a" and
        # "b" is dropped while all 12 input bytes are consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            spliced = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(spliced, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The reported length is the number of input items consumed.
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001272
Martin v. Löwis2548c732003-04-18 10:39:54 +00001273# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1274nameprep_tests = [
1275 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001276 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1277 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1278 b'\xb8\x8f\xef\xbb\xbf',
1279 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001281 (b'CAFE',
1282 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001283 # 3.3 Case folding 8bit U+00DF (german sharp s).
1284 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001285 (b'\xc3\x9f',
1286 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001288 (b'\xc4\xb0',
1289 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001290 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001291 (b'\xc5\x83\xcd\xba',
1292 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1294 # XXX: skip this as it fails in UCS-2 mode
1295 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1296 # 'telc\xe2\x88\x95kg\xcf\x83'),
1297 (None, None),
1298 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001299 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1300 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001302 (b'\xe1\xbe\xb7',
1303 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001304 # 3.9 Self-reverting case folding U+01F0 and normalization.
1305 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001306 (b'\xc7\xb0',
1307 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001308 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'\xce\x90',
1310 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'\xce\xb0',
1313 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'\xe1\xba\x96',
1316 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001318 (b'\xe1\xbd\x96',
1319 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001321 (b' ',
1322 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001323 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001324 (b'\xc2\xa0',
1325 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001327 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328 None),
1329 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001330 (b'\xe2\x80\x80',
1331 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001333 (b'\xe2\x80\x8b',
1334 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001336 (b'\xe3\x80\x80',
1337 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001338 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001339 (b'\x10\x7f',
1340 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001342 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001343 None),
1344 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001345 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346 None),
1347 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001348 (b'\xef\xbb\xbf',
1349 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001350 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001351 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352 None),
1353 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001354 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001355 None),
1356 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001357 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358 None),
1359 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001360 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001361 None),
1362 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001363 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001364 None),
1365 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001366 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001367 None),
1368 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001369 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 None),
1371 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001372 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001373 None),
1374 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001375 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001376 None),
1377 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001378 (b'\xcd\x81',
1379 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001380 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001381 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001382 None),
1383 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001384 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001385 None),
1386 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001387 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001388 None),
1389 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001390 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001391 None),
1392 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001393 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001394 None),
1395 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001396 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001397 None),
1398 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001399 (b'foo\xef\xb9\xb6bar',
1400 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001401 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001402 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001403 None),
1404 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001405 (b'\xd8\xa71\xd8\xa8',
1406 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001407 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001408 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001409 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001410 # None),
1411 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001412 # 3.44 Larger test (shrinking).
1413 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001414 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1415 b'\xaa\xce\xb0\xe2\x80\x80',
1416 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001417 # 3.45 Larger test (expanding).
1418 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001419 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1420 b'\x80',
1421 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1422 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1423 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001424 ]
1425
1426
class NameprepTest(unittest.TestCase):
    """Feed every vector in the module-level nameprep_tests table to nameprep()."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        # Number the vectors from 1 so a failure can be reported as "Test 3.<n>".
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # (None, None) marks a vector that is deliberately skipped.
                continue
            # The vectors are stored as UTF-8 encoded bytes.
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # No expected output: nameprep() must reject this input.
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as e:
                # Re-raise with the vector number so the failing case is identifiable.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001445
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via the builtin, stream and incremental APIs.

    Every name is checked both with and without the trailing root dot, and
    both as plain ASCII and as an ACE ("xn--") label.
    """

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been consumed, a further read() must
        # return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().  The same
        # four names as in test_builtin_decode are used; the ACE name without
        # a trailing dot was previously missing (the trailing-dot case was
        # asserted twice instead).
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            chunks = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(chunks, "idna")),
                expected
            )

        # Drive an incremental decoder by hand: a label is only emitted once
        # its terminating dot has been seen or the input is declared final.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input after reset(), this time with a trailing dot, so the
        # last label is flushed before the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Mirror of test_incremental_decode: the non-ASCII name without a
        # trailing dot was previously missing (the trailing-dot case was
        # asserted twice instead).
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Drive an incremental encoder by hand: the unfinished last label is
        # held back until the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # Same input after reset(), with a trailing dot: nothing is left to
        # flush on the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001521
class CodecsModuleTest(unittest.TestCase):
    """Check argument handling of the module-level functions in codecs."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # '\uffff' is a single non-ASCII code point.  The previous literal
        # '\xffff' was really '\xff' followed by the two letters "ff".
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\uffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # A search function must be callable.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it runs even on failure.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1576
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory byte stream."""

    def setUp(self):
        # Two three-byte UTF-8 sequences separated by a newline.
        payload = b'\xed\x95\x9c\n\xea\xb8\x80'
        self.stream = io.BytesIO(payload)
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # The first line keeps its newline; the last one has none.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001586
class EncodedFileTest(unittest.TestCase):
    """Round-trip bytes through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 file content comes back recoded as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data handed to the wrapper lands in the file as Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001598
# Names of the codecs that BasicUnicodeTest iterates over.  Each entry is
# passed to codecs.lookup()/getencoder()/getdecoder() as-is.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of codecs exposes mbcs_encode
# (presumably Windows builds only -- confirm).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
# (the stream reader/writer checks skip them).
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Codecs skipped by the incremental encoder/decoder checks: everything that
# is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001717
# Generic round-trip checks applied to every codec in all_unicode_encodings.
# NOTE(review): this block was recovered from a view in which indentation was
# collapsed; the nesting below (in particular the "errors argument" section
# sitting inside the broken_incremental_coders guard) is reconstructed and
# should be confirmed against the canonical file.
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical name reported by lookup() must match the table
            # entry, ignoring the "_" / "-" distinction.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                # Stateless round trip through getencoder()/getdecoder().
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect the bytes that
                # reach the underlying Queue.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one at a time through a reader.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # Final call flushes whatever the encoder still buffers.
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Note: no final flush call is made in this section.
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            # Re-read the whole stream several times from the start.
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # "idna" and "punycode" are also required to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (The self-comparison below is intentional: it only forces the
        # type object to be used.)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        # The check_state_handling_* helpers are expected to come from
        # MixInCheckStateHandling (only the decode helper's definition is
        # visible near the top of the file -- confirm the encode one).
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1855
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001856class CharmapTest(unittest.TestCase):
1857 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001858 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001859 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001860 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001861 )
1862
Ezio Melottib3aedd42010-11-20 19:04:17 +00001863 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001864 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1865 ("\U0010FFFFbc", 3)
1866 )
1867
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001868 self.assertRaises(UnicodeDecodeError,
1869 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1870 )
1871
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001872 self.assertRaises(UnicodeDecodeError,
1873 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1874 )
1875
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001876 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001877 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001878 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001879 )
1880
Ezio Melottib3aedd42010-11-20 19:04:17 +00001881 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001882 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001883 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001884 )
1885
Ezio Melottib3aedd42010-11-20 19:04:17 +00001886 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001887 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001888 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001889 )
1890
Ezio Melottib3aedd42010-11-20 19:04:17 +00001891 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001892 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001893 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001894 )
1895
Guido van Rossum805365e2007-05-07 22:24:25 +00001896 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001897 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001898 codecs.charmap_decode(allbytes, "ignore", ""),
1899 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001900 )
1901
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001902 def test_decode_with_int2str_map(self):
1903 self.assertEqual(
1904 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1905 {0: 'a', 1: 'b', 2: 'c'}),
1906 ("abc", 3)
1907 )
1908
1909 self.assertEqual(
1910 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1911 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1912 ("AaBbCc", 3)
1913 )
1914
1915 self.assertEqual(
1916 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1917 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1918 ("\U0010FFFFbc", 3)
1919 )
1920
1921 self.assertEqual(
1922 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1923 {0: 'a', 1: 'b', 2: ''}),
1924 ("ab", 3)
1925 )
1926
1927 self.assertRaises(UnicodeDecodeError,
1928 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1929 {0: 'a', 1: 'b'}
1930 )
1931
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001932 self.assertRaises(UnicodeDecodeError,
1933 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1934 {0: 'a', 1: 'b', 2: None}
1935 )
1936
1937 # Issue #14850
1938 self.assertRaises(UnicodeDecodeError,
1939 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1940 {0: 'a', 1: 'b', 2: '\ufffe'}
1941 )
1942
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001943 self.assertEqual(
1944 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1945 {0: 'a', 1: 'b'}),
1946 ("ab\ufffd", 3)
1947 )
1948
1949 self.assertEqual(
1950 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1951 {0: 'a', 1: 'b', 2: None}),
1952 ("ab\ufffd", 3)
1953 )
1954
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001955 # Issue #14850
1956 self.assertEqual(
1957 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1958 {0: 'a', 1: 'b', 2: '\ufffe'}),
1959 ("ab\ufffd", 3)
1960 )
1961
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001962 self.assertEqual(
1963 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1964 {0: 'a', 1: 'b'}),
1965 ("ab", 3)
1966 )
1967
1968 self.assertEqual(
1969 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1970 {0: 'a', 1: 'b', 2: None}),
1971 ("ab", 3)
1972 )
1973
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001974 # Issue #14850
1975 self.assertEqual(
1976 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1977 {0: 'a', 1: 'b', 2: '\ufffe'}),
1978 ("ab", 3)
1979 )
1980
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001981 allbytes = bytes(range(256))
1982 self.assertEqual(
1983 codecs.charmap_decode(allbytes, "ignore", {}),
1984 ("", len(allbytes))
1985 )
1986
1987 def test_decode_with_int2int_map(self):
1988 a = ord('a')
1989 b = ord('b')
1990 c = ord('c')
1991
1992 self.assertEqual(
1993 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1994 {0: a, 1: b, 2: c}),
1995 ("abc", 3)
1996 )
1997
1998 # Issue #15379
1999 self.assertEqual(
2000 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2001 {0: 0x10FFFF, 1: b, 2: c}),
2002 ("\U0010FFFFbc", 3)
2003 )
2004
Antoine Pitroua1f76552012-09-23 20:00:04 +02002005 self.assertEqual(
2006 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2007 {0: sys.maxunicode, 1: b, 2: c}),
2008 (chr(sys.maxunicode) + "bc", 3)
2009 )
2010
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002011 self.assertRaises(TypeError,
2012 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002013 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002014 )
2015
2016 self.assertRaises(UnicodeDecodeError,
2017 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2018 {0: a, 1: b},
2019 )
2020
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002021 self.assertRaises(UnicodeDecodeError,
2022 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2023 {0: a, 1: b, 2: 0xFFFE},
2024 )
2025
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002026 self.assertEqual(
2027 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2028 {0: a, 1: b}),
2029 ("ab\ufffd", 3)
2030 )
2031
2032 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002033 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2034 {0: a, 1: b, 2: 0xFFFE}),
2035 ("ab\ufffd", 3)
2036 )
2037
2038 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002039 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2040 {0: a, 1: b}),
2041 ("ab", 3)
2042 )
2043
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002044 self.assertEqual(
2045 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2046 {0: a, 1: b, 2: 0xFFFE}),
2047 ("ab", 3)
2048 )
2049
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002050
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers work as context managers."""

    def test_encodedfile(self):
        # The wrapped stream holds UTF-8 bytes; reading through the
        # wrapper yields the same character as Latin-1 bytes.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002063
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only_decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only present on some builds; include it when
        # the codecs module provides it.
        try:
            bytes_only_decoders.append(codecs.mbcs_decode)
        except AttributeError:
            pass
        for decoder in bytes_only_decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape beyond U+10FFFF is an error in strict mode and turns
        # into U+FFFD with the "replace" handler.
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2099
Serhiy Storchakad6793772013-01-29 10:20:44 +02002100
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is passed through.
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character
        # with the same ordinal.
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # Remaining control characters and everything from 127 to 255
        # use the two-digit hex form.
        for code in range(32):
            if code in (9, 10, 13):     # '\t', '\n', '\r' checked above
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for escaped, decoded in cases:
            check(escaped, decoded)
        # A backslash before any other byte is kept literally.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes followed by too few digits.
        for marker, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # An escape beyond U+10FFFF.
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2177
2178
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and raw_unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below 256 encodes to the byte with its ordinal.
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            expected = (bytes([code]), 1)
            self.assertEqual(encode(chr(code)), expected)

    def test_raw_decode(self):
        # Every byte decodes to the character with the same ordinal.
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            expected = (chr(code) + '0', 2)
            self.assertEqual(decode(bytes([code]) + b'0'), expected)

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash is only special in front of 'u' or 'U'.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash is only special in front of 'u' or 'U'.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Truncated \u and \U escapes followed by too few digits.
        for marker, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                truncated = b"\\" + marker + b"0"*count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # An escape beyond U+10FFFF.
        too_large = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), ("", 10))
        self.assertEqual(decode(too_large, "replace"), ("\ufffd", 10))
2227
2228
class SurrogateEscapeTest(unittest.TestCase):
    """Tests for the "surrogateescape" error handler."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding *raw* must give *text*, and encoding *text* must give
        # back *raw*, both with the surrogateescape handler.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80",
                              "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2261
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002262
class BomTest(unittest.TestCase):
    """Check how files from codecs.open() behave around seek() calls."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        bom_encodings = ("utf-16",
                         "utf-16-le",
                         "utf-16-be",
                         "utf-32",
                         "utf-32-le",
                         "utf-32-be")

        def reopen(encoding):
            # Every scenario starts from a freshly truncated test file.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in bom_encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002318
Victor Stinner3fed0872010-05-22 02:16:27 +00002319
# Names of the bytes-to-bytes codecs that are always expected to exist.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# Alternative names under which each transform codec can be looked up.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# The compression codecs are only listed when their backing module
# can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002349
2350class TransformCodecTest(unittest.TestCase):
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00002351
Georg Brandl02524622010-12-02 18:06:51 +00002352 def test_basics(self):
2353 binput = bytes(range(256))
Georg Brandl02524622010-12-02 18:06:51 +00002354 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002355 with self.subTest(encoding=encoding):
2356 # generic codecs interface
2357 (o, size) = codecs.getencoder(encoding)(binput)
2358 self.assertEqual(size, len(binput))
2359 (i, size) = codecs.getdecoder(encoding)(o)
2360 self.assertEqual(size, len(o))
2361 self.assertEqual(i, binput)
Georg Brandl02524622010-12-02 18:06:51 +00002362
Georg Brandl02524622010-12-02 18:06:51 +00002363 def test_read(self):
2364 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002365 with self.subTest(encoding=encoding):
2366 sin = codecs.encode(b"\x80", encoding)
2367 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2368 sout = reader.read()
2369 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002370
2371 def test_readline(self):
2372 for encoding in bytes_transform_encodings:
2373 if encoding in ['uu_codec', 'zlib_codec']:
2374 continue
Nick Coghlan8b097b42013-11-13 23:49:21 +10002375 with self.subTest(encoding=encoding):
2376 sin = codecs.encode(b"\x80", encoding)
2377 reader = codecs.getreader(encoding)(io.BytesIO(sin))
2378 sout = reader.readline()
2379 self.assertEqual(sout, b"\x80")
Georg Brandl02524622010-12-02 18:06:51 +00002380
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002381 def test_buffer_api_usage(self):
2382 # We check all the transform codecs accept memoryview input
2383 # for encoding and decoding
2384 # and also that they roundtrip correctly
2385 original = b"12345\x80"
2386 for encoding in bytes_transform_encodings:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002387 with self.subTest(encoding=encoding):
2388 data = original
2389 view = memoryview(data)
2390 data = codecs.encode(data, encoding)
2391 view_encoded = codecs.encode(view, encoding)
2392 self.assertEqual(view_encoded, data)
2393 view = memoryview(data)
2394 data = codecs.decode(data, encoding)
2395 self.assertEqual(data, original)
2396 view_decoded = codecs.decode(view, encoding)
2397 self.assertEqual(view_decoded, data)
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002398
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002399 def test_text_to_binary_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002400 # Check binary -> binary codecs give a good error for str input
2401 bad_input = "bad input type"
2402 for encoding in bytes_transform_encodings:
2403 with self.subTest(encoding=encoding):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002404 fmt = ( "{!r} is not a text encoding; "
2405 "use codecs.encode\(\) to handle arbitrary codecs")
2406 msg = fmt.format(encoding)
2407 with self.assertRaisesRegex(LookupError, msg) as failure:
Nick Coghlan8b097b42013-11-13 23:49:21 +10002408 bad_input.encode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002409 self.assertIsNone(failure.exception.__cause__)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002410
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002411 def test_text_to_binary_blacklists_text_transforms(self):
2412 # Check str.encode gives a good error message for str -> str codecs
2413 msg = (r"^'rot_13' is not a text encoding; "
2414 "use codecs.encode\(\) to handle arbitrary codecs")
2415 with self.assertRaisesRegex(LookupError, msg):
2416 "just an example message".encode("rot_13")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002417
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002418 def test_binary_to_text_blacklists_binary_transforms(self):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002419 # Check bytes.decode and bytearray.decode give a good error
2420 # message for binary -> binary codecs
2421 data = b"encode first to ensure we meet any format restrictions"
2422 for encoding in bytes_transform_encodings:
2423 with self.subTest(encoding=encoding):
2424 encoded_data = codecs.encode(data, encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002425 fmt = (r"{!r} is not a text encoding; "
2426 "use codecs.decode\(\) to handle arbitrary codecs")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002427 msg = fmt.format(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002428 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002429 encoded_data.decode(encoding)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002430 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002431 bytearray(encoded_data).decode(encoding)
2432
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002433 def test_binary_to_text_blacklists_text_transforms(self):
2434 # Check str -> str codec gives a good error for binary input
2435 for bad_input in (b"immutable", bytearray(b"mutable")):
2436 with self.subTest(bad_input=bad_input):
2437 msg = (r"^'rot_13' is not a text encoding; "
2438 "use codecs.decode\(\) to handle arbitrary codecs")
2439 with self.assertRaisesRegex(LookupError, msg) as failure:
2440 bad_input.decode("rot_13")
2441 self.assertIsNone(failure.exception.__cause__)
2442
2443 def test_custom_zlib_error_is_wrapped(self):
2444 # Check zlib codec gives a good error for malformed input
2445 msg = "^decoding with 'zlib_codec' codec failed"
2446 with self.assertRaisesRegex(Exception, msg) as failure:
2447 codecs.decode(b"hello", "zlib_codec")
2448 self.assertIsInstance(failure.exception.__cause__,
2449 type(failure.exception))
2450
2451 def test_custom_hex_error_is_wrapped(self):
2452 # Check hex codec gives a good error for malformed input
2453 msg = "^decoding with 'hex_codec' codec failed"
2454 with self.assertRaisesRegex(Exception, msg) as failure:
2455 codecs.decode(b"hello", "hex_codec")
2456 self.assertIsInstance(failure.exception.__cause__,
2457 type(failure.exception))
2458
2459 # Unfortunately, the bz2 module throws OSError, which the codec
2460 # machinery currently can't wrap :(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002461
Nick Coghlan9c1aed82013-11-23 11:13:36 +10002462 # Ensure codec aliases from http://bugs.python.org/issue7475 work
2463 def test_aliases(self):
2464 for codec_name, aliases in transform_aliases.items():
2465 expected_name = codecs.lookup(codec_name).name
2466 for alias in aliases:
2467 with self.subTest(alias=alias):
2468 info = codecs.lookup(alias)
2469 self.assertEqual(info.name, expected_name)
2470
Nick Coghlan8b097b42013-11-13 23:49:21 +10002471
2472# The codec system tries to wrap exceptions in order to ensure the error
2473# mentions the operation being performed and the codec involved. We
2474# currently *only* want this to happen for relatively stateless
2475# exceptions, where the only significant information they contain is their
2476# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002477
2478# Use a local codec registry to avoid appearing to leak objects when
2479# registering multiple seach functions
2480_TEST_CODECS = {}
2481
2482def _get_test_codec(codec_name):
2483 return _TEST_CODECS.get(codec_name)
2484codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2485
Nick Coghlan8b097b42013-11-13 23:49:21 +10002486class ExceptionChainingTest(unittest.TestCase):
2487
2488 def setUp(self):
2489 # There's no way to unregister a codec search function, so we just
2490 # ensure we render this one fairly harmless after the test
2491 # case finishes by using the test case repr as the codec name
2492 # The codecs module normalizes codec names, although this doesn't
2493 # appear to be formally documented...
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002494 # We also make sure we use a truly unique id for the custom codec
2495 # to avoid issues with the codec cache when running these tests
2496 # multiple times (e.g. when hunting for refleaks)
2497 unique_id = repr(self) + str(id(self))
2498 self.codec_name = encodings.normalize_encoding(unique_id).lower()
2499
2500 # We store the object to raise on the instance because of a bad
2501 # interaction between the codec caching (which means we can't
2502 # recreate the codec entry) and regrtest refleak hunting (which
2503 # runs the same test instance multiple times). This means we
2504 # need to ensure the codecs call back in to the instance to find
2505 # out which exception to raise rather than binding them in a
2506 # closure to an object that may change on the next run
2507 self.obj_to_raise = RuntimeError
Nick Coghlan8b097b42013-11-13 23:49:21 +10002508
Nick Coghlan4e553e22013-11-16 00:35:34 +10002509 def tearDown(self):
2510 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002511
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002512 def set_codec(self, encode, decode):
2513 codec_info = codecs.CodecInfo(encode, decode,
Nick Coghlan4e553e22013-11-16 00:35:34 +10002514 name=self.codec_name)
2515 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002516
2517 @contextlib.contextmanager
2518 def assertWrapped(self, operation, exc_type, msg):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002519 full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
Nick Coghlan8b097b42013-11-13 23:49:21 +10002520 operation, self.codec_name, exc_type.__name__, msg)
2521 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2522 yield caught
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002523 self.assertIsInstance(caught.exception.__cause__, exc_type)
2524
2525 def raise_obj(self, *args, **kwds):
2526 # Helper to dynamically change the object raised by a test codec
2527 raise self.obj_to_raise
Nick Coghlan8b097b42013-11-13 23:49:21 +10002528
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002529 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002530 self.obj_to_raise = obj_to_raise
2531 self.set_codec(self.raise_obj, self.raise_obj)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002532 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002533 "str_input".encode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002534 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002535 codecs.encode("str_input", self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002536 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002537 b"bytes input".decode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002538 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002539 codecs.decode(b"bytes input", self.codec_name)
2540
2541 def test_raise_by_type(self):
2542 self.check_wrapped(RuntimeError, "")
2543
2544 def test_raise_by_value(self):
2545 msg = "This should be wrapped"
2546 self.check_wrapped(RuntimeError(msg), msg)
2547
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002548 def test_raise_grandchild_subclass_exact_size(self):
2549 msg = "This should be wrapped"
2550 class MyRuntimeError(RuntimeError):
2551 __slots__ = ()
2552 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2553
2554 def test_raise_subclass_with_weakref_support(self):
2555 msg = "This should be wrapped"
2556 class MyRuntimeError(RuntimeError):
2557 pass
2558 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2559
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002560 def check_not_wrapped(self, obj_to_raise, msg):
2561 def raise_obj(*args, **kwds):
2562 raise obj_to_raise
2563 self.set_codec(raise_obj, raise_obj)
2564 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002565 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002566 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002567 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002568 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002569 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002570 with self.assertRaisesRegex(RuntimeError, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002571 codecs.decode(b"bytes input", self.codec_name)
2572
2573 def test_init_override_is_not_wrapped(self):
2574 class CustomInit(RuntimeError):
2575 def __init__(self):
2576 pass
2577 self.check_not_wrapped(CustomInit, "")
2578
2579 def test_new_override_is_not_wrapped(self):
2580 class CustomNew(RuntimeError):
2581 def __new__(cls):
2582 return super().__new__(cls)
2583 self.check_not_wrapped(CustomNew, "")
2584
2585 def test_instance_attribute_is_not_wrapped(self):
2586 msg = "This should NOT be wrapped"
2587 exc = RuntimeError(msg)
2588 exc.attr = 1
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002589 self.check_not_wrapped(exc, "^{}$".format(msg))
Nick Coghlan8b097b42013-11-13 23:49:21 +10002590
2591 def test_non_str_arg_is_not_wrapped(self):
2592 self.check_not_wrapped(RuntimeError(1), "1")
2593
2594 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002595 msg_re = r"^\('a', 'b', 'c'\)$"
2596 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002597
2598 # http://bugs.python.org/issue19609
2599 def test_codec_lookup_failure_not_wrapped(self):
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002600 msg = "^unknown encoding: {}$".format(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002601 # The initial codec lookup should not be wrapped
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002602 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002603 "str input".encode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002604 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002605 codecs.encode("str input", self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002606 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002607 b"bytes input".decode(self.codec_name)
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002608 with self.assertRaisesRegex(LookupError, msg):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002609 codecs.decode(b"bytes input", self.codec_name)
2610
def test_unflagged_non_text_codec_handling(self):
    # The stdlib non-text codecs are now marked so they're
    # pre-emptively skipped by the text model related methods
    # However, third party codecs won't be flagged, so we still make
    # sure the case where an inappropriate output type is produced is
    # handled appropriately
    def encode_to_str(*args, **kwds):
        # Deliberately returns str where an encoder should return bytes
        return "not bytes!", 0
    def decode_to_bytes(*args, **kwds):
        # Deliberately returns bytes where a decoder should return str
        return b"not str!", 0
    self.set_codec(encode_to_str, decode_to_bytes)
    # No input or output type checks on the codecs module functions
    encoded = codecs.encode(None, self.codec_name)
    self.assertEqual(encoded, "not bytes!")
    decoded = codecs.decode(None, self.codec_name)
    self.assertEqual(decoded, b"not str!")
    # Text model methods should complain
    # Both fragments of each pattern are raw strings: the second one
    # contains the regex escapes \( and \), which are invalid escape
    # sequences in a non-raw string literal.
    fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
           r"use codecs.encode\(\) to encode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        "str_input".encode(self.codec_name)
    fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
           r"use codecs.decode\(\) to decode to arbitrary types$")
    msg = fmt.format(self.codec_name)
    with self.assertRaisesRegex(TypeError, msg):
        b"bytes input".decode(self.codec_name)
2638
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002639
Georg Brandl02524622010-12-02 18:06:51 +00002640
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # Code page -1 is expected to be rejected with ValueError and
        # code page 123 with OSError, for both encoding and decoding.
        for exc_type, code_page in ((ValueError, -1), (OSError, 123)):
            with self.assertRaises(exc_type):
                codecs.code_page_encode(code_page, 'a')
            with self.assertRaises(exc_type):
                codecs.code_page_decode(code_page, b'a')

    def test_code_page_name(self):
        # The text of the raised error is expected to mention the name
        # of the code page that was used.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run code_page_decode() over a table of cases for code page *cp*.

        Each entry of *tests* is a ``(raw, errors, expected)`` triple.  When
        *expected* is None the call must raise UnicodeDecodeError; otherwise
        the decoded text must equal *expected* and the reported number of
        consumed bytes must lie between 0 and ``len(raw)``.
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as exc:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, exc))
            text = result[0]
            consumed = result[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # assert 0 <= consumed <= len(raw)
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run code_page_encode() over a table of cases for code page *cp*.

        Each entry of *tests* is a ``(text, errors, expected)`` triple.  When
        *expected* is None the call must raise UnicodeEncodeError; otherwise
        the encoded bytes must equal *expected* and the reported length must
        equal ``len(text)``.
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as exc:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, exc))
            data = result[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        utf7_cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(utf7_cp, encode_cases)
        self.check_decode(utf7_cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The CP_UTF8 encoding cases are only run on Vista or later.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Every case decodes with final=False; the expected value is the
        # (text, consumed) pair returned by code_page_decode().
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2788
2789
# Run this module's tests when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()