# blob: 639ce9fd8522c2ff6c6fc02ceab551d88cd576be  (header left over from the web blame view this file was extracted from)
import _testcapi
import codecs
import contextlib
import encodings
import io
import locale
import sys
import unittest
import warnings

from test import support

# True only when running on Windows with a major version of 6 or higher;
# sys.getwindowsversion() exists only on Windows, hence the platform guard.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional: when it cannot be imported, ``ctypes`` is None and
# SIZEOF_WCHAR_T is the sentinel -1; otherwise it is the size in bytes of
# ctypes.c_wchar.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to a test case.

    *self* is the object whose ``assertEqual`` is used for the comparison and
    *coder* is a callable taking one argument.  The returned helper asserts
    that ``coder(input)`` equals the tuple ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. b"" or "") decides
        # what kind of data can be appended later with write().
        self._buffer = buffer

    def write(self, chars):
        # Append *chars* to the end of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to *size* items from the front of the queue;
        # a negative *size* drains the whole queue.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class MixInCheckStateHandling:
    """Mixin with getstate()/setstate() round-trip checks for incremental codecs.

    The host class must provide the unittest assertion methods used below
    (assertIsInstance, assertFalse, assertEqual).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoder state can be transplanted at every split of *s*.

        For each split point, the first part of the bytes *s* is decoded, the
        decoder state is copied into a fresh decoder, and the rest is decoded
        there; both parts joined must equal the text *u*.
        """
        make_decoder = codecs.getincrementaldecoder(encoding)
        for i in range(len(s)+1):
            d = make_decoder()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = make_decoder()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoder state can be transplanted at every split of *u*.

        For each split point, the first part of the text *u* is encoded, the
        encoder state is copied into a fresh encoder, and the rest is encoded
        there; both parts joined must equal the bytes *s*.
        """
        make_encoder = codecs.getincrementalencoder(encoding)
        for i in range(len(u)+1):
            d = make_encoder()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = make_encoder()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Mixin with reading/decoding tests shared by the codec test cases.

    Concrete subclasses must define ``encoding`` (the codec name) and
    ``ill_formed_sequence`` (the encoded form of a lone surrogate, used by
    test_lone_surrogates), and must also derive from unittest.TestCase.
    """

    def check_partial(self, input, partialresults):
        """Decode *input* byte by byte and compare against *partialresults*.

        *input* is encoded with ``self.encoding``; after each single byte is
        fed to the decoder, everything decoded so far must equal the matching
        entry of *partialresults*.  The check is run with a StreamReader, with
        an incremental decoder, with the same decoder after reset(), and
        finally through codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line and return the lines joined by "|"
            # so that the split points are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: all of them are whitespace,
        # so building the list with "\n \r\n \r \u2028".split() yields an
        # empty list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline()).
        # Every line is non-empty (i*200+200), because readalllines() stops
        # at the first empty result when keepends is false.
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the filler line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the \r\n
        # terminated lines that were written.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so readline() only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What decoding self.ill_formed_sequence with errors="replace" is expected
    # to produce; subclasses may override it.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate fails in strict mode and is handled by
        # each of the standard error handlers.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Encoding the empty string leaves only the BOM (if the codec writes
        # one), which is stripped from the encoded neighbours below.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)

class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-32" codec."""

    encoding = "utf-32"
    # Lone surrogate U+DC80 as one UTF-32 code unit in this machine's
    # byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read
                "",  # third byte of BOM read
                "",  # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"
    # Lone surrogate U+DC80 as one little-endian UTF-32 code unit.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"
    # Lone surrogate U+DC80 as one big-endian UTF-32 code unit.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-aware "utf-16" codec."""

    encoding = "utf-16"
    # Lone surrogate U+DC80 as one UTF-16 code unit in this machine's
    # byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # The 'U' mode is deliberate: the \r\n pairs must come back
        # unchanged even though universal newlines were requested.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"
    # Lone surrogate U+DC80 as one little-endian UTF-16 code unit.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result of decoding with "replace")
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"
    # Lone surrogate U+DC80 as one big-endian UTF-16 code unit.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result of decoding with "replace")
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode(self.encoding, 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the ``utf-8`` codec.

    Further checks are inherited from ``ReadTest``, which is defined
    outside this block.
    """
    encoding = "utf-8"
    # Three-byte form of U+DC80 (a lone surrogate) and the text expected
    # when it is decoded with the "replace" handler; both are read by
    # inherited checks defined outside this block.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        """Check the text produced after each successive chunk of input."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # 15 entries for 15 encoded bytes; presumably one per input byte
        # (see ReadTest.check_partial for the exact protocol).
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        """Run the shared decoder-state check over a mix of code points."""
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        """Extend the inherited check with a ``surrogateescape`` case."""
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        """``surrogatepass`` round-trips lone surrogates.

        Incomplete surrogate byte sequences must still be rejected.
        """
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000749
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows-only ``cp65001`` codec.

    Expectations for lone surrogates depend on ``VISTA_OR_LATER``, a
    module-level flag defined outside this block.
    """
    encoding = "cp65001"

    def test_encode(self):
        """Encode a table of ``(text, errors, expected)`` vectors.

        ``expected`` is ``None`` when encoding must raise
        ``UnicodeEncodeError``. Each vector runs in its own subTest so
        that one failure does not hide the remaining vectors.
        """
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            with self.subTest(text=text, errors=errors):
                if expected is not None:
                    try:
                        encoded = text.encode('cp65001', errors)
                    except UnicodeEncodeError as err:
                        self.fail('Unable to encode %a to cp65001 with '
                                  'errors=%r: %s' % (text, errors, err))
                    self.assertEqual(encoded, expected,
                        '%a.encode("cp65001", %r)=%a != %a'
                        % (text, errors, encoded, expected))
                else:
                    self.assertRaises(UnicodeEncodeError,
                        text.encode, "cp65001", errors)

    def test_decode(self):
        """Decode a table of ``(raw, errors, expected)`` vectors.

        ``expected`` is ``None`` when decoding must raise
        ``UnicodeDecodeError``. Each vector runs in its own subTest so
        that one failure does not hide the remaining vectors.
        """
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            with self.subTest(raw=raw, errors=errors):
                if expected is not None:
                    try:
                        decoded = raw.decode('cp65001', errors)
                    except UnicodeDecodeError as err:
                        self.fail('Unable to decode %a from cp65001 with '
                                  'errors=%r: %s' % (raw, errors, err))
                    self.assertEqual(decoded, expected,
                        '%a.decode("cp65001", %r)=%a != %a'
                        % (raw, errors, decoded, expected))
                else:
                    self.assertRaises(UnicodeDecodeError,
                        raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected by default and handled per handler."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """``surrogatepass`` round-trips lone surrogates in both directions."""
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
848
849
850
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the ``utf-7`` codec.

    Further checks are inherited from ``ReadTest``, which is defined
    outside this block.
    """
    encoding = "utf-7"

    def test_partial(self):
        """Check the text produced after each successive chunk of input."""
        text = "a+-b"
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced otherwise."""
        vectors = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        strict_decode = codecs.utf_7_decode
        for raw, expected in vectors:
            with self.subTest(raw=raw):
                # final=True: incomplete trailing input must be an error.
                self.assertRaises(UnicodeDecodeError, strict_decode,
                                  raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character and its surrogate pair encode identically."""
        encoded = b'+2AHcoA-'
        self.assertEqual('\U000104A0'.encode(self.encoding), encoded)
        self.assertEqual('\ud801\udca0'.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Replaces the inherited test_lone_surrogates with a non-callable so
    # it is not run for this codec.
    test_lone_surrogates = None
897
898
class UTF16ExTest(unittest.TestCase):
    """Error-path checks for ``codecs.utf_16_ex_decode``."""

    def test_errors(self):
        """A lone byte with final=True is rejected in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder with no arguments raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
906
class ReadBufferTest(unittest.TestCase):
    """Checks for ``codecs.readbuffer_encode``."""

    def test_array(self):
        """An ``array`` of bytes is returned as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
922
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the ``utf-8-sig`` codec (UTF-8 with an optional BOM).

    Inherits the ``UTF8Test`` checks and adds BOM-specific ones.
    """
    encoding = "utf-8-sig"

    def test_partial(self):
        """Check the text produced after each successive chunk of input.

        The text itself starts with U+FEFF, so the encoded stream holds
        the signature BOM followed by a second, real U+FEFF.
        """
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        """A buffer holding only the BOM decodes to the empty string."""
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """Encoding then incrementally decoding returns the original text."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sized_reads(self, bytestring, unistring):
        """Read *bytestring* via a utf-8-sig reader at several read sizes.

        For ``read()`` with no argument and for each size in 1..10 and
        64..1024, reads until an empty result and asserts that the
        concatenated chunks equal *unistring*. Each size runs in its own
        subTest so a failure reports the offending size.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is not None:
                        data = istream.read(sizehint)
                    else:
                        data = istream.read()

                    if not data:
                        break
                    ostream.write(data)

                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        """A stream starting with a BOM decodes to the text without it."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_reads(bytestring, unistring)

    def test_stream_bare(self):
        """A stream without a BOM decodes to the same text."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_reads(bytestring, unistring)
1006
class EscapeDecodeTest(unittest.TestCase):
    """Checks for ``codecs.escape_decode``."""

    def test_empty(self):
        """Empty input decodes to empty output with nothing consumed."""
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        """Every byte except the backslash passes through unchanged."""
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            payload = byte + b'0'
            self.assertEqual(codecs.escape_decode(payload), (payload, 2))

    def test_escape(self):
        """Known escapes are translated; unknown ones are left as written."""
        check = coding_checker(self, codecs.escape_decode)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in cases:
            check(escaped, plain)
        # Any byte that does not start a recognised escape must come back
        # together with its backslash.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognised:
                continue
            unknown = b'\\' + bytes([code])
            check(unknown, unknown)

    def test_errors(self):
        r"""Truncated ``\x`` escapes raise, or follow the error handler."""
        decode = codecs.escape_decode
        vectors = [
            (br"\x", br"[\x]", br"[\x]\x", 6),
            (br"\x0", br"[\x0]", br"[\x0]\x0", 8),
        ]
        for bare, bracketed, doubled, consumed in vectors:
            with self.assertRaises(ValueError):
                decode(bare)
            with self.assertRaises(ValueError):
                decode(bracketed)
            self.assertEqual(decode(doubled, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(doubled, "replace"), (b"[?]?", consumed))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001058
class RecodingTest(unittest.TestCase):
    """Regression test for writing through ``codecs.EncodedFile``."""

    def test_recoding(self):
        """Write one character through a recoding wrapper and close it."""
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so there is nothing to assert here:
        # finishing without an error is the test.
        raw_stream = io.BytesIO()
        recoder = codecs.EncodedFile(raw_stream, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001067
# Sample strings from RFC 3492, as (unicode text, punycode bytes) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple, which
# is what a missing comma between adjacent literals would produce.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001175
class PunycodeTest(unittest.TestCase):
    """Run the module-level ``punycode_testcases`` through the codec."""

    def test_encode(self):
        """Each sample encodes to the expected punycode, ignoring case."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        """Each punycode sample decodes back to its unicode text."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt through str.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001194
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated ``unicode_internal`` codec.

    The codec emits a DeprecationWarning, so its calls are wrapped in
    ``support.check_warnings``.
    """

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Code points above 0x10ffff and odd-sized input are rejected."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The vectors are written big-endian and reversed on
        # little-endian hosts.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is one past the largest valid code point.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError reports the failing span."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises sits inside check_warnings so that the block exits
        # normally and the deprecation warning is verified as well.
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            with self.assertRaises(UnicodeDecodeError) as caught:
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A custom error handler can skip an invalid 4-byte unit."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four 0x22 bytes between the encoded "a" and "b";
            # the registered handler must drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Encoders report the amount of input consumed."""
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001270
Martin v. Löwis2548c732003-04-18 10:39:54 +00001271# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1272nameprep_tests = [
1273 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001274 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1275 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1276 b'\xb8\x8f\xef\xbb\xbf',
1277 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001278 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001279 (b'CAFE',
1280 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001281 # 3.3 Case folding 8bit U+00DF (german sharp s).
1282 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001283 (b'\xc3\x9f',
1284 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001285 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001286 (b'\xc4\xb0',
1287 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001288 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001289 (b'\xc5\x83\xcd\xba',
1290 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001291 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1292 # XXX: skip this as it fails in UCS-2 mode
1293 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1294 # 'telc\xe2\x88\x95kg\xcf\x83'),
1295 (None, None),
1296 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001297 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1298 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001299 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001300 (b'\xe1\xbe\xb7',
1301 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001302 # 3.9 Self-reverting case folding U+01F0 and normalization.
1303 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001304 (b'\xc7\xb0',
1305 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001306 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001307 (b'\xce\x90',
1308 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001309 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001310 (b'\xce\xb0',
1311 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001312 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001313 (b'\xe1\xba\x96',
1314 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001315 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001316 (b'\xe1\xbd\x96',
1317 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001318 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001319 (b' ',
1320 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001322 (b'\xc2\xa0',
1323 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001324 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001325 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001326 None),
1327 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001328 (b'\xe2\x80\x80',
1329 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001330 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001331 (b'\xe2\x80\x8b',
1332 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001333 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001334 (b'\xe3\x80\x80',
1335 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001336 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001337 (b'\x10\x7f',
1338 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001339 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001340 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001341 None),
1342 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001343 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001344 None),
1345 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001346 (b'\xef\xbb\xbf',
1347 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001348 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001349 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001350 None),
1351 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001352 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001353 None),
1354 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001355 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001356 None),
1357 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001358 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001359 None),
1360 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001361 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001362 None),
1363 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001364 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001365 None),
1366 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001367 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001368 None),
1369 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001370 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001371 None),
1372 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001373 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001374 None),
1375 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001376 (b'\xcd\x81',
1377 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001378 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001379 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001380 None),
1381 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001382 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001383 None),
1384 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001385 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001386 None),
1387 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001388 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001389 None),
1390 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001391 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001392 None),
1393 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001394 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001395 None),
1396 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001397 (b'foo\xef\xb9\xb6bar',
1398 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001399 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001400 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001401 None),
1402 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001403 (b'\xd8\xa71\xd8\xa8',
1404 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001405 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001406 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001407 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001408 # None),
1409 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001410 # 3.44 Larger test (shrinking).
1411 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001412 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1413 b'\xaa\xce\xb0\xe2\x80\x80',
1414 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001415 # 3.45 Larger test (expanding).
1416 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001417 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1418 b'\x80',
1419 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1420 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1421 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001422 ]
1423
1424
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        from encodings.idna import nameprep

        # Number the vectors from 1 so failure messages read "Test 3.<n>".
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Placeholder entry: this vector is skipped.
                continue
            # The Unicode strings are given in UTF-8; "surrogatepass" is
            # used so that encoded surrogates in the vectors still decode.
            text = raw.decode("utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters.
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                # Re-raise with the vector number attached.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001443
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec through its one-shot, stream and
    incremental interfaces."""

    def test_builtin_decode(self):
        # ASCII and "xn--" names, each with and without a trailing dot.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Mirror image of test_builtin_decode.
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After read(3) has consumed the whole input, a further read()
        # must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed each name to iterdecode() one byte at a time.  The four
        # cases match test_builtin_decode; previously the "xn--" name
        # without a trailing dot was missing and the dotted one was
        # asserted twice.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        # A label is only returned once the dot that ends it has been
        # seen; the last label comes out of the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input after reset(), this time ending with a dot, so
        # nothing is left over for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # The four cases match test_builtin_encode; previously the
        # non-ASCII name without a trailing dot was missing and the
        # dotted one was asserted twice.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # The encoder holds back the last, possibly unfinished, label
        # until the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # Same input after reset(), this time ending with a dot.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001519
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the module-level
    functions in codecs."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument may be omitted.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument may be omitted.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE: '\xffff' is U+00FF followed by the two characters "ff".
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching, so the locale is restored even
        # when the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        codec_info = codecs.lookup('ASCII')
        self.assertEqual(codec_info.name, 'ascii')
1574
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        # Two UTF-8 encoded lines; the second has no trailing newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001584
class EncodedFileTest(unittest.TestCase):
    """Recoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the file are returned as UTF-16-LE.
        backing = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(backing, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes written are stored in the file as Latin-1.
        backing = io.BytesIO()
        recoded = codecs.EncodedFile(backing, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(backing.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001596
# Codec names iterated by the BasicUnicodeTest methods, in sorted order.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: the ones
# above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001715
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every name in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        fmt = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must agree with the listed
            # name once "_" and "-" are treated as the same.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                encoded, consumed = codecs.getencoder(encoding)(s)
                self.assertEqual(consumed, len(s),
                                 fmt % (consumed, len(s), encoding))
                decoded, consumed = codecs.getdecoder(encoding)(encoded)
                self.assertEqual(decoded, s, fmt % (decoded, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                sink = Queue(b"")
                writer = codecs.getwriter(encoding)(sink)
                encodedresult = b""
                for char in s:
                    writer.write(char)
                    chunk = sink.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                source = Queue(b"")
                reader = codecs.getreader(encoding)(source)
                decodedresult = ""
                for byte in encodedresult:
                    # Feed the reader a single byte at a time.
                    source.write(bytes([byte]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s,
                                 fmt % (decodedresult, s, encoding))

            if encoding in broken_incremental_coders:
                # Nothing below applies to these codecs.
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                encodedresult = b"".join(encoder.encode(char) for char in s)
                encodedresult += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decodedresult = "".join(
                    decoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += decoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 fmt % (decodedresult, s, encoding))

                # check C API
                encodedresult = b"".join(cencoder.encode(char) for char in s)
                encodedresult += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decodedresult = "".join(
                    cdecoder.decode(bytes([byte])) for byte in encodedresult)
                decodedresult += cdecoder.decode(b"", True)
                self.assertEqual(decodedresult, s,
                                 fmt % (decodedresult, s, encoding))

                # check iterencode()/iterdecode()
                result = "".join(
                    codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                self.assertEqual(result, s, fmt % (result, s, encoding))

                # check iterencode()/iterdecode() with empty string
                result = "".join(
                    codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                self.assertEqual(result, "")

            if encoding not in ("idna", "mbcs"):
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encodedresult = b"".join(encoder.encode(char) for char in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = "".join(
                        decoder.decode(bytes([byte])) for byte in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     fmt % (decodedresult, s, encoding))

                    encodedresult = b"".join(cencoder.encode(char) for char in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = "".join(
                        cdecoder.decode(bytes([byte])) for byte in encodedresult)
                    self.assertEqual(decodedresult, s,
                                     fmt % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped as well.  FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            # The integer-argument check is not applied to these two.
            if encoding not in ("idna", "punycode"):
                with self.assertRaises(TypeError):
                    decoder(42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError):
                    encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            encoded = u.encode(encoding)
            self.check_state_handling_decode(encoding, u, encoded)
            self.check_state_handling_encode(encoding, u, encoded)
1853
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001854class CharmapTest(unittest.TestCase):
1855 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001856 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001857 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001858 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001859 )
1860
Ezio Melottib3aedd42010-11-20 19:04:17 +00001861 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001862 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1863 ("\U0010FFFFbc", 3)
1864 )
1865
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001866 self.assertRaises(UnicodeDecodeError,
1867 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1868 )
1869
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001870 self.assertRaises(UnicodeDecodeError,
1871 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1872 )
1873
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001874 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001875 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001876 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001877 )
1878
Ezio Melottib3aedd42010-11-20 19:04:17 +00001879 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001880 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001881 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001882 )
1883
Ezio Melottib3aedd42010-11-20 19:04:17 +00001884 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001885 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001886 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001887 )
1888
Ezio Melottib3aedd42010-11-20 19:04:17 +00001889 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001890 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001891 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001892 )
1893
Guido van Rossum805365e2007-05-07 22:24:25 +00001894 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001895 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001896 codecs.charmap_decode(allbytes, "ignore", ""),
1897 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001898 )
1899
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001900 def test_decode_with_int2str_map(self):
1901 self.assertEqual(
1902 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1903 {0: 'a', 1: 'b', 2: 'c'}),
1904 ("abc", 3)
1905 )
1906
1907 self.assertEqual(
1908 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1909 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1910 ("AaBbCc", 3)
1911 )
1912
1913 self.assertEqual(
1914 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1915 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1916 ("\U0010FFFFbc", 3)
1917 )
1918
1919 self.assertEqual(
1920 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1921 {0: 'a', 1: 'b', 2: ''}),
1922 ("ab", 3)
1923 )
1924
1925 self.assertRaises(UnicodeDecodeError,
1926 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1927 {0: 'a', 1: 'b'}
1928 )
1929
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001930 self.assertRaises(UnicodeDecodeError,
1931 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1932 {0: 'a', 1: 'b', 2: None}
1933 )
1934
1935 # Issue #14850
1936 self.assertRaises(UnicodeDecodeError,
1937 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1938 {0: 'a', 1: 'b', 2: '\ufffe'}
1939 )
1940
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001941 self.assertEqual(
1942 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1943 {0: 'a', 1: 'b'}),
1944 ("ab\ufffd", 3)
1945 )
1946
1947 self.assertEqual(
1948 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1949 {0: 'a', 1: 'b', 2: None}),
1950 ("ab\ufffd", 3)
1951 )
1952
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001953 # Issue #14850
1954 self.assertEqual(
1955 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1956 {0: 'a', 1: 'b', 2: '\ufffe'}),
1957 ("ab\ufffd", 3)
1958 )
1959
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001960 self.assertEqual(
1961 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1962 {0: 'a', 1: 'b'}),
1963 ("ab", 3)
1964 )
1965
1966 self.assertEqual(
1967 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1968 {0: 'a', 1: 'b', 2: None}),
1969 ("ab", 3)
1970 )
1971
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001972 # Issue #14850
1973 self.assertEqual(
1974 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1975 {0: 'a', 1: 'b', 2: '\ufffe'}),
1976 ("ab", 3)
1977 )
1978
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001979 allbytes = bytes(range(256))
1980 self.assertEqual(
1981 codecs.charmap_decode(allbytes, "ignore", {}),
1982 ("", len(allbytes))
1983 )
1984
1985 def test_decode_with_int2int_map(self):
1986 a = ord('a')
1987 b = ord('b')
1988 c = ord('c')
1989
1990 self.assertEqual(
1991 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1992 {0: a, 1: b, 2: c}),
1993 ("abc", 3)
1994 )
1995
1996 # Issue #15379
1997 self.assertEqual(
1998 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1999 {0: 0x10FFFF, 1: b, 2: c}),
2000 ("\U0010FFFFbc", 3)
2001 )
2002
Antoine Pitroua1f76552012-09-23 20:00:04 +02002003 self.assertEqual(
2004 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2005 {0: sys.maxunicode, 1: b, 2: c}),
2006 (chr(sys.maxunicode) + "bc", 3)
2007 )
2008
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002009 self.assertRaises(TypeError,
2010 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002011 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002012 )
2013
2014 self.assertRaises(UnicodeDecodeError,
2015 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2016 {0: a, 1: b},
2017 )
2018
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002019 self.assertRaises(UnicodeDecodeError,
2020 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2021 {0: a, 1: b, 2: 0xFFFE},
2022 )
2023
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002024 self.assertEqual(
2025 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2026 {0: a, 1: b}),
2027 ("ab\ufffd", 3)
2028 )
2029
2030 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002031 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2032 {0: a, 1: b, 2: 0xFFFE}),
2033 ("ab\ufffd", 3)
2034 )
2035
2036 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002037 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2038 {0: a, 1: b}),
2039 ("ab", 3)
2040 )
2041
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002042 self.assertEqual(
2043 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2044 {0: a, 1: b, 2: 0xFFFE}),
2045 ("ab", 3)
2046 )
2047
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002048
class WithStmtTest(unittest.TestCase):
    """Check that the codecs stream wrappers can be used in a with statement."""

    def test_encodedfile(self):
        # The wrapped file holds UTF-8 bytes; reading yields latin-1 bytes.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        codec = codecs.lookup("utf-8")
        raw = io.BytesIO(b"\xc3\xbc")
        stream = codecs.StreamReaderWriter(raw, codec.streamreader,
                                           codec.streamwriter, 'strict')
        with stream as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002061
class TypesTest(unittest.TestCase):
    """Check which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        # mbcs_decode is only checked when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            names.append("mbcs_decode")
        for name in names:
            decoder = getattr(codecs, name)
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

            # A code point above 0x10FFFF is an error unless it is replaced.
            with self.assertRaises(UnicodeDecodeError):
                decode(br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
2097
Serhiy Storchakad6793772013-01-29 10:20:44 +02002098
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Code points 32..126 other than the backslash are encoded unchanged.
        backslash = b'\\'[0]
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        backslash = b'\\'[0]
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(
                codecs.unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.unicode_escape_encode)
        named_escapes = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for char, escaped in named_escapes:
            check(char, escaped)
        # Remaining code points below 32 and those in 127..255 use \xNN.
        for code in [*range(32), *range(127, 256)]:
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.unicode_escape_decode)
        cases = [
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        ]
        for escaped, text in cases:
            check(escaped, text)
        # A backslash before any other byte is kept as a literal backslash.
        for code in range(256):
            if code not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape followed by only i hex digits.
                truncated = b"\\" + c + b"0"*i
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2175
2176
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and its decoder."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every code point below 256 is encoded as the byte of the same value.
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the code point of the same value.
        for code in range(256):
            self.assertEqual(
                codecs.raw_unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        # A backslash followed by anything but 'u' or 'U' is left untouched.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        # A backslash followed by anything but 'u' or 'U' is left untouched.
        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                # An escape followed by only i hex digits.
                truncated = b"\\" + c + b"0"*i
                with self.assertRaises(UnicodeDecodeError):
                    decode(truncated)
                with self.assertRaises(UnicodeDecodeError):
                    decode(b"[" + truncated + b"]")
                data = b"[" + truncated + b"]" + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        with self.assertRaises(UnicodeDecodeError):
            decode(br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2225
2226
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        pairs = [
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        ]
        for raw, text in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2259
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002260
class BomTest(unittest.TestCase):
    """Check when the UTF-16/UTF-32 codecs write a BOM around seek() calls."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings_under_test = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )

        def fresh_file(encoding):
            # Reopen the scratch file in 'w+' mode with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings_under_test:
            # Check if the BOM is written only once
            with fresh_file(encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with fresh_file(encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with fresh_file(encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with fresh_file(encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with fresh_file(encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002316
Victor Stinner3fed0872010-05-22 02:16:27 +00002317
# Names of the bytes-to-bytes transform codecs exercised by the tests below.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec",
                             "hex_codec"]

# Alias names that must resolve to each transform codec.
transform_aliases = dict(
    base64_codec=["base64", "base_64"],
    uu_codec=["uu"],
    quopri_codec=["quopri", "quoted_printable", "quotedprintable"],
    hex_codec=["hex"],
    rot_13=["rot13"],
)

# The compression codecs are only listed when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]
    transform_aliases.update(zlib_codec=["zip", "zlib"])
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
    transform_aliases.update(bz2_codec=["bz2"])
Georg Brandl02524622010-12-02 18:06:51 +00002347
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes transform codecs.

    The codecs under test are those listed in bytes_transform_encodings.
    The tests cover the generic codecs interface, the stream readers, and
    the errors raised when a transform codec is used through the text
    model methods (str.encode, bytes.decode and bytearray.decode).
    """

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # uu_codec and zlib_codec are excluded from the readline() check
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # Raw strings: the pattern contains regex escapes
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        # zlib_codec is only listed when the zlib module could be imported
        if "zlib_codec" not in bytes_transform_encodings:
            self.skipTest("Requires zlib support")
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)
2468
Nick Coghlan8b097b42013-11-13 23:49:21 +10002469
2470# The codec system tries to wrap exceptions in order to ensure the error
2471# mentions the operation being performed and the codec involved. We
2472# currently *only* want this to happen for relatively stateless
2473# exceptions, where the only significant information they contain is their
2474# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002475
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    """Codec search function: return the test codec stored under
    codec_name in _TEST_CODECS, or None when there is none."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2483
class ExceptionChainingTest(unittest.TestCase):
    """Check which codec exceptions are wrapped by the codec machinery.

    Each test installs a throwaway codec in _TEST_CODECS whose encode and
    decode functions raise a chosen exception, then checks whether the
    exception seen by the caller is wrapped (message mentions the
    operation and the codec, original exception chained as __cause__) or
    left as it was raised.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)

    def set_codec(self, encode, decode):
        """Install a codec named self.codec_name built from the callables."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the with-block raises a wrapped exc_type.

        The message must match "<operation> with '<codec>' codec failed
        (<exc_type name>: <msg>)" and the exception's __cause__ must be an
        instance of exc_type.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
            operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        """Check obj_to_raise is wrapped for every encode/decode entry point."""
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        """Check obj_to_raise reaches the caller with its own message."""
        # Route the exception through the instance (see setUp) instead of
        # binding it in a closure, exactly as check_wrapped does
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        # Raw strings: the patterns contain regex escapes
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
2636
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002637
Georg Brandl02524622010-12-02 18:06:51 +00002638
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() and codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError; code page 123
        # is expected to fail with OSError.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(OSError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(OSError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Decode every (raw, errors, expected) case with code page *cp*.
        # expected=None means the call has to raise UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = result[0]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # the consumed count must satisfy 0 <= consumed <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Encode every (text, errors, expected) case with code page *cp*.
        # expected=None means the call has to raise UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = result[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            # the whole input must be reported as consumed
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The CP_UTF8 encode cases only run when VISTA_OR_LATER is true
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each case is decoded with False passed as the fourth argument;
        # the expected tuples pair the decoded text with the number of
        # bytes reported as consumed (a lone trailing lead byte is left
        # unconsumed).
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2786
2787
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()