blob: 728f7d006b71ab3e64852470004eae13a55d0028 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Nick Coghlan8b097b42013-11-13 23:49:21 +10003import contextlib
Victor Stinner040e16e2011-11-15 22:44:05 +01004import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02005import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01006import sys
7import unittest
8import warnings
9
10from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020011
# True only when running on Windows with a major version of 6 or higher;
# sys.getwindowsversion() is never called on other platforms.
VISTA_OR_LATER = (sys.platform == 'win32' and
                  sys.getwindowsversion().major >= 6)
16
# ctypes is optional: when the import fails the module name is bound to
# None and SIZEOF_WCHAR_T is set to -1.
try:
    import ctypes
except ImportError:
    ctypes = None

if ctypes is None:
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000024
def coding_checker(self, coder):
    """Return a checker bound to the test case *self* and to *coder*.

    The returned ``check(input, expect)`` calls ``coder(input)`` and
    asserts, through ``self.assertEqual``, that the result equals the
    pair ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
29
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type is preserved by all
        # later operations (slicing and in-place concatenation only).
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.
        """
        if size >= 0:
            head, self._buffer = self._buffer[:size], self._buffer[size:]
            return head
        drained = self._buffer
        self._buffer = drained[:0]  # make empty
        return drained
49
class MixInCheckStateHandling:
    """Helpers checking that incremental codecs can save and restore state.

    The methods call assertIsInstance, assertTrue and assertEqual on self,
    so the host class has to provide them.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every offset, move the decoder state from one
        decoder to a fresh one at the split point, and check that the two
        decoded parts add up to *u*."""
        for split in range(len(s) + 1):
            first = codecs.getincrementaldecoder(encoding)()
            head = first.decode(s[:split])
            state = first.getstate()
            pending, flags = state[0], state[1]
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                # reset decoder to the default state without anything buffered
                first.setstate((pending[:0], 0))
                # Feeding the previous input may not produce any output
                replayed = first.decode(pending)
                self.assertTrue(not replayed)
                # The decoder must return to the same state
                self.assertEqual(state, first.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            second = codecs.getincrementaldecoder(encoding)()
            second.setstate(state)
            tail = second.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every offset, move the encoder state from one
        encoder to a fresh one at the split point, and check that the two
        encoded parts add up to *s*."""
        for split in range(len(u) + 1):
            first = codecs.getincrementalencoder(encoding)()
            head = first.encode(u[:split])
            state = first.getstate()
            second = codecs.getincrementalencoder(encoding)()
            second.setstate(state)
            tail = second.encode(u[split:], True)
            self.assertEqual(s, head + tail)
82
class ReadTest(MixInCheckStateHandling):
    """Reader and decoder tests shared by the codec test cases.

    Concrete subclasses are expected to also derive from unittest.TestCase
    and to set ``encoding``.  test_lone_surrogates additionally reads
    ``ill_formed_sequence`` (the encoded form of the lone surrogate U+DC80)
    from the subclass.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* byte by byte and compare the text
        decoded so far with the matching entry of *partialresults*.

        The check is done with a StreamReader, with an incremental decoder,
        with the same incremental decoder after reset(), and finally with
        codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()

        def check_incremental():
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        check_incremental()

        # Check whether the reset method works properly
        d.reset()
        check_incremental()

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be spelled out as a tuple: every one of them
        # is whitespace, so "\n \r\n \r \u2028".split() returns an empty
        # list and the loops below would never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line would make
            # readalllines() stop early when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # each "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must return the input lines
        # unchanged, including their "\r\n" line ends.
        # NOTE(review): the runs of spaces inside this fixture may have been
        # collapsed when the file was extracted -- compare with upstream.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # A writer and a reader share one Queue, so the reader only sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # readline() must return each of the three lines intact and then
        # an empty string at the end of the stream.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # Text expected when ill_formed_sequence is decoded with the "replace"
    # error handler; subclasses may override it.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        # Encoding a lone surrogate fails in strict mode and follows the
        # requested error handler otherwise.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # Whatever the codec emits for an empty string (a BOM, if any) is
        # stripped from the parts and put once at the front of the sequence.
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
336
class UTF32Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM handling for the "utf-32" codec."""

    encoding = "utf-32"
    # U+DC80 encoded in the byte order of the running platform
    ill_formed_sequence = (b"\x80\xdc\x00\x00" if sys.byteorder == 'little'
                           else b"\x00\x00\xdc\x80")

    # "spamspam" preceded by a single BOM, little and big endian
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec = codecs.lookup(self.encoding)
        # encode some stream
        out = io.BytesIO()
        stream_writer = codec.streamwriter(out)
        for chunk in ("spam", "spam"):
            stream_writer.write(chunk)
        d = out.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d in (self.spamle, self.spambe))
        # try to read it back
        stream_reader = codec.streamreader(io.BytesIO(d))
        self.assertEqual(stream_reader.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two 0xffffffff units form a valid BOM.
        for count in (4, 8):
            s = io.BytesIO(count*b"\xff")
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # (characters decoded so far, number of consecutive input bytes
        # after which exactly that many characters are available).
        # The first four bytes are the BOM; after the fourth the byte order
        # is known.  Every character then takes four bytes.
        progress = [(0, 7), (1, 4), (2, 4), (3, 4), (4, 4), (5, 1)]
        self.check_partial(
            text,
            [text[:chars] for chars, times in progress for _ in range(times)]
        )

    def test_handlers(self):
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
431
class UTF32LETest(ReadTest, unittest.TestCase):
    """ReadTest checks plus codec specific ones for "utf-32-le"."""

    encoding = "utf-32-le"
    # U+DC80 as a little-endian 32-bit unit
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # (characters decoded so far, number of consecutive input bytes
        # after which exactly that many characters are available).
        # There is no BOM and every character takes four bytes.
        progress = [(0, 3), (1, 4), (2, 4), (3, 4), (4, 4), (5, 1)]
        self.check_partial(
            text,
            [text[:chars] for chars, times in progress for _ in range(times)]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = b'\x00\x00\x01\x00' * count
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * count, decoded)
476
class UTF32BETest(ReadTest, unittest.TestCase):
    """ReadTest checks plus codec specific ones for "utf-32-be"."""

    encoding = "utf-32-be"
    # U+DC80 as a big-endian 32-bit unit
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # (characters decoded so far, number of consecutive input bytes
        # after which exactly that many characters are available).
        # There is no BOM and every character takes four bytes.
        progress = [(0, 3), (1, 4), (2, 4), (3, 4), (4, 4), (5, 1)]
        self.check_partial(
            text,
            [text[:chars] for chars, times in progress for _ in range(times)]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        count = 1024
        encoded = b'\x00\x01\x00\x00' * count
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * count, decoded)
521
522
class UTF16Test(ReadTest, unittest.TestCase):
    """ReadTest checks plus BOM handling for the "utf-16" codec."""

    encoding = "utf-16"
    # U+DC80 encoded in the byte order of the running platform
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" preceded by a single BOM, little and big endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two 0xffff units form a valid BOM.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single trailing byte is an error that the handlers must consume.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Use plain 'r' rather than 'U': the 'U' flag is deprecated since
        # Python 3.4 and rejected with ValueError since 3.11, while 'r'
        # still exercises "no binary mode specified".
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000606
class UTF16LETest(ReadTest, unittest.TestCase):
    """ReadTest checks plus codec specific ones for "utf-16-le"."""

    encoding = "utf-16-le"
    # U+DC80 as a little-endian 16-bit unit
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # (characters decoded so far, number of consecutive input bytes
        # after which exactly that many characters are available).
        # There is no BOM; the first four characters take two bytes each
        # and the last one takes four.
        progress = [(0, 1), (1, 2), (2, 2), (3, 2), (4, 4), (5, 1)]
        self.check_partial(
            text,
            [text[:chars] for chars, times in progress for _ in range(times)]
        )

    def test_errors(self):
        # (ill-formed input, result of decoding it with 'replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16le', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # U+10203 round-trips through its surrogate pair.
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
650
class UTF16BETest(ReadTest, unittest.TestCase):
    """ReadTest checks plus codec specific ones for "utf-16-be"."""

    encoding = "utf-16-be"
    # U+DC80 as a big-endian 16-bit unit
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # (characters decoded so far, number of consecutive input bytes
        # after which exactly that many characters are available).
        # There is no BOM; the first four characters take two bytes each
        # and the last one takes four.
        progress = [(0, 1), (1, 2), (2, 2), (3, 2), (4, 4), (5, 1)]
        self.check_partial(
            text,
            [text[:chars] for chars, times in progress for _ in range(times)]
        )

    def test_errors(self):
        # (ill-formed input, result of decoding it with 'replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16be', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # U+10203 round-trips through its surrogate pair.
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
694
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"
    # Three bytes that would spell the lone surrogate U+DC80; strict UTF-8
    # rejects them.
    ill_formed_sequence = b"\xed\xb2\x80"
    ill_formed_sequence_replace = "\ufffd" * 3

    def test_partial(self):
        # Each (prefix_length, repeat) pair expands to `repeat` consecutive
        # expected entries equal to text[:prefix_length].
        # NOTE(review): check_partial is defined on ReadTest, outside this
        # view; the list appears to hold one entry per encoded byte
        # (1, 2, 2, 3, 3 and 4 bytes for the six characters) -- confirm.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        plateaus = [(1, 2), (2, 2), (3, 3), (4, 3), (5, 4), (6, 1)]
        expected = [text[:size]
                    for size, repeat in plateaus
                    for _ in range(repeat)]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        escaped = "[\uDC80]".encode('utf-8', "surrogateescape")
        self.assertEqual(escaped, b'[\x80]')

    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # A truncated or interrupted surrogate sequence is still an error.
        self.assertRaises(UnicodeDecodeError,
                          b"abc\xed\xa0".decode, "utf-8", "surrogatepass")
        self.assertRaises(UnicodeDecodeError,
                          b"abc\xed\xa0z".decode, "utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000748
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for the Windows code page 65001 codec ("cp65001")."""

    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); None means "must raise".
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # (bytes, error handler, expected text); None means "must raise".
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        # (error handler, expected encoding of "[\uDC80]")
        handlers = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, expected in handlers:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text containing a lone surrogate, its "surrogatepass" encoding)
        round_trips = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
847
848
849
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)

    def test_errors(self):
        # (ill-formed input, expected result of decoding with 'replace')
        cases = [
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in cases:
            with self.subTest(raw=raw):
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                replaced = raw.decode('utf-7', 'replace')
                self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair must encode
        # to the same bytes, which decode back to the astral character.
        encoded = b'+2AHcoA-'
        for text in ('\U000104A0', '\ud801\udca0'):
            self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), '\U000104A0')

    # Setting the attribute to None overrides the test_lone_surrogates
    # inherited from ReadTest.
    test_lone_surrogates = None
896
897
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte cannot form a UTF-16 code unit; with final=True the
        # strict decoder has to raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
905
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        from array import array
        encoded = codecs.readbuffer_encode(array("b", b"spam"))
        self.assertEqual(encoded, (b"spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # Both a missing argument and a non-buffer argument are rejected.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
921
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the "utf-8-sig" codec (UTF-8 with a BOM signature)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        # Each (prefix_length, repeat) pair expands to `repeat` consecutive
        # expected entries equal to text[:prefix_length].  The text itself
        # starts with U+FEFF, so the encoded form carries two BOMs: the
        # codec's signature followed by the encoded character.
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        plateaus = [
            (0, 5),  # first BOM read and skipped, second BOM incomplete
            (1, 1),  # second BOM has been read and emitted
            (2, 2),  # "\x00" emitted, then first byte of "\xff" read
            (3, 2),  # "\xff" emitted, then first byte of "\u07ff" read
            (4, 3),  # "\u07ff" emitted, "\u0800" still incomplete
            (5, 3),  # "\u0800" emitted, "\uffff" still incomplete
            (6, 4),  # "\uffff" emitted, "\U00010000" still incomplete
            (7, 1),  # everything decoded
        ]
        expected = [text[:size]
                    for size, repeat in plateaus
                    for _ in range(repeat)]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The signature written by the encoder must be dropped on decoding.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_sig_stream(self, bytestring, unistring):
        # Read `bytestring` through a utf-8-sig StreamReader, once without
        # a size argument and once for each of several read sizes, and
        # check that the concatenated chunks equal `unistring`.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            # One sub-test per size so a failure names the size involved
            # and the remaining sizes are still exercised.
            with self.subTest(sizehint=sizehint):
                istream = reader(io.BytesIO(bytestring))
                ostream = io.StringIO()
                while True:
                    if sizehint is None:
                        data = istream.read()
                    else:
                        data = istream.read(sizehint)
                    if not data:
                        # An empty result ends the read loop.
                        break
                    ostream.write(data)
                self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM: the BOM must not show up
        # in the decoded text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sig_stream(bytestring, unistring)
1005
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        result = codecs.escape_decode(b"")
        self.assertEqual(result, (b"", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        decode = codecs.escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            data = bytes([code]) + b'0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in cases:
            check(escaped, expected)
        # A backslash followed by any byte outside this set is expected to
        # be left as it is.
        recognized = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in recognized:
                continue
            literal = b'\\' + bytes([code])
            check(literal, literal)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: strict decoding raises, while "ignore" and
        # "replace" drop or substitute them and still report the whole
        # input as consumed.
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            doubled = bracketed + truncated
            with self.assertRaises(ValueError):
                decode(truncated)
            with self.assertRaises(ValueError):
                decode(bracketed)
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001057
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile()."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001066
# Sample strings from RFC 3492.
# Each entry is a pair: (text, expected punycode encoding as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish:
    #     Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #     T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch
    #     <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: report any entry that is not a two-element
# pair (e.g. because a comma went missing between its literals).
for testcase in punycode_testcases:
    if len(testcase) != 2:
        print(repr(testcase))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001174
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for text, encoded in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = text.encode("punycode").decode("ascii").lower()
            wanted = encoded.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from a bytes object rebuilt through an ASCII
            # round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001193
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    # Filter passed to support.check_warnings() wherever the codec's
    # DeprecationWarning is required to be emitted.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = (sys.byteorder == "little")
        # The samples are written big-endian and reversed on little-endian
        # machines.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in valid:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings():
                self.assertEqual(text, raw.decode("unicode_internal"))
        for raw in rejected:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings(self._DEPRECATED):
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode("unicode_internal")
        # 0x110000 is one past the last valid code point.
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            with self.assertRaises(UnicodeDecodeError):
                invalid.decode("unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(self._DEPRECATED):
                payload.decode("unicode_internal")
        except UnicodeDecodeError as exc:
            # The error must describe the second 4-byte unit of the input.
            self.assertEqual("unicode_internal", exc.encoding)
            self.assertEqual(payload, exc.object)
            self.assertEqual(4, exc.start)
            self.assertEqual(8, exc.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register the stock "ignore" handler under a private name and
        # check that the bad unit spliced between "a" and "b" is skipped.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            spliced = "%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:])
            ignored = decode(bytes(spliced, "ascii"), "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            self.assertEqual(encode("a")[1], 1)
            self.assertEqual(encode("\xe9\u0142")[1], 2)

            consumed = codecs.escape_encode(br'\x00')[1]
            self.assertEqual(consumed, 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001269
Martin v. Löwis2548c732003-04-18 10:39:54 +00001270# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1271nameprep_tests = [
1272 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001273 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1274 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1275 b'\xb8\x8f\xef\xbb\xbf',
1276 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001277 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001278 (b'CAFE',
1279 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280 # 3.3 Case folding 8bit U+00DF (german sharp s).
1281 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001282 (b'\xc3\x9f',
1283 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001284 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001285 (b'\xc4\xb0',
1286 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001287 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001288 (b'\xc5\x83\xcd\xba',
1289 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001290 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1291 # XXX: skip this as it fails in UCS-2 mode
1292 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1293 # 'telc\xe2\x88\x95kg\xcf\x83'),
1294 (None, None),
1295 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001296 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1297 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001298 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001299 (b'\xe1\xbe\xb7',
1300 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001301 # 3.9 Self-reverting case folding U+01F0 and normalization.
1302 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001303 (b'\xc7\xb0',
1304 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001305 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001306 (b'\xce\x90',
1307 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001308 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001309 (b'\xce\xb0',
1310 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001311 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001312 (b'\xe1\xba\x96',
1313 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001315 (b'\xe1\xbd\x96',
1316 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001317 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001318 (b' ',
1319 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001320 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001321 (b'\xc2\xa0',
1322 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001323 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001324 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001325 None),
1326 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001327 (b'\xe2\x80\x80',
1328 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001329 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001330 (b'\xe2\x80\x8b',
1331 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001332 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001333 (b'\xe3\x80\x80',
1334 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001335 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001336 (b'\x10\x7f',
1337 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001338 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001339 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340 None),
1341 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001342 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001343 None),
1344 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001345 (b'\xef\xbb\xbf',
1346 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001347 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001348 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001349 None),
1350 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001351 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352 None),
1353 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001354 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001355 None),
1356 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001357 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358 None),
1359 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001360 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001361 None),
1362 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001363 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001364 None),
1365 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001366 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001367 None),
1368 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001369 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 None),
1371 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001372 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001373 None),
1374 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001375 (b'\xcd\x81',
1376 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001377 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001378 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001379 None),
1380 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001381 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001382 None),
1383 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001384 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001385 None),
1386 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001387 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001388 None),
1389 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001390 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001391 None),
1392 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001393 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001394 None),
1395 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001396 (b'foo\xef\xb9\xb6bar',
1397 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001398 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001399 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001400 None),
1401 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001402 (b'\xd8\xa71\xd8\xa8',
1403 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001404 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001405 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001406 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001407 # None),
1408 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001409 # 3.44 Larger test (shrinking).
1410 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001411 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1412 b'\xaa\xce\xb0\xe2\x80\x80',
1413 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001414 # 3.45 Larger test (expanding).
1415 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001416 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1417 b'\x80',
1418 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1419 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1420 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001421 ]
1422
1423
class NameprepTest(unittest.TestCase):
    # Runs encodings.idna.nameprep over the module-level nameprep_tests
    # table.  Each entry is an (orig, prepped) pair of UTF-8 byte strings;
    # orig is None for a skipped vector and prepped is None when nameprep
    # is expected to reject the input with UnicodeError.
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Failures are reported with the vector's "3.N" number (N is the
            # 1-based position in the table) in both branches below.
            label = "Test 3.%d" % (pos+1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters.  The msg makes an
                # unexpectedly accepted input name the offending vector.
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    raise support.TestFailed("%s: %s" % (label, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001442
class IDNACodecTest(unittest.TestCase):
    # Exercises the "idna" codec through str()/str.encode(), a stream
    # reader, iterdecode()/iterencode() and the incremental coder objects.
    # The same four host names are used throughout: ASCII and non-ASCII,
    # each with and without a trailing dot.

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once read(3) has been called on the three-byte stream, a further
        # read() must return an empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # The input is fed to iterdecode() one byte at a time.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII name without a trailing dot: the last label is only
        # produced by the final (flushing) decode call.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is held back until the dot that terminates it (or the
        # final call) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() walks the str argument one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII name without a trailing dot: the last label is only
        # produced by the final (flushing) encode call.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is held back until the dot that terminates it (or the
        # final call) has been seen.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001518
class CodecsModuleTest(unittest.TestCase):
    # Argument checking and basic behaviour of the module-level helpers in
    # codecs: decode(), encode(), register(), lookup() and the get*() family.

    def _assert_rejects_bad_names(self, getter):
        # Shared check: no argument -> TypeError, unknown codec -> LookupError.
        with self.assertRaises(TypeError):
            getter()
        with self.assertRaises(LookupError):
            getter("__spam__")

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # Called without an explicit encoding.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # Called without an explicit encoding.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        # A non-callable search function is rejected as well.
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        self._assert_rejects_bad_names(codecs.lookup)
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        self._assert_rejects_bad_names(codecs.getencoder)

    def test_getdecoder(self):
        self._assert_rejects_bad_names(codecs.getdecoder)

    def test_getreader(self):
        self._assert_rejects_bad_names(codecs.getreader)

    def test_getwriter(self):
        self._assert_rejects_bad_names(codecs.getwriter)

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Put the original LC_CTYPE back whatever happens below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1573
class StreamReaderTest(unittest.TestCase):
    # Line-oriented reading through a UTF-8 StreamReader.

    def setUp(self):
        # Two lines of UTF-8 text; the first one ends with "\n".
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        # readlines() must split on the newline and keep it in the result.
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001583
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile transcoding, once on read and once on write.

    def test_basic(self):
        # Reading: the UTF-8 content of the file comes back as UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        transcoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        expected = b'\\\xd5\n\x00\x00\xae'
        self.assertEqual(transcoder.read(), expected)

        # Writing: UTF-8 data handed to write() lands in the file as Latin-1.
        sink = io.BytesIO()
        transcoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        transcoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001595
# Names of the encodings that the generic tests below iterate over.
# The order is significant only for readability; entries are grouped by
# codec family.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# "undefined" is deliberately left out of the list above: that encoding
# is not supposed to work, so it is not tested.

# Encodings that do not work in stateful mode (stream reader/writer).
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings excluded from the incremental encoder/decoder checks: the
# ones above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001713
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks that loop over every name in all_unicode_encodings.
    # Encodings listed in broken_unicode_with_streams and
    # broken_incremental_coders are left out of the stateful checks.

    def test_basics(self):
        # Round-trip a short ASCII sample through each codec's stateless,
        # stream, incremental and iterencode()/iterdecode() interfaces.
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must equal the list entry once
            # "_" and "-" are treated as the same character.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                # NOTE(review): presumably lookup() reports a different
                # canonical name for latin_1 -- confirm.
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encoder/decoder: both must consume the whole input
            # and the decoded text must equal the sample.
            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect whatever the
                # writer pushed into the queue after each write.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one byte at a time and read
                # whatever the reader can decode so far.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    # Encode per character, then flush with final=True.
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    # Decode per byte, then flush with final=True.
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    # Same round trip with the coder objects obtained from
                    # _testcapi.
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Unlike the checks above, no final=True flush is
                        # issued here.
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # A stream reader must return the complete text again after every
        # seek(0, 0).
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Every decoder must raise TypeError when called without arguments
        # and, except for idna and punycode, when handed an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must raise TypeError when called without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # (The comparison of the type with itself is therefore intentional.)
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        # The actual checks live in the check_state_handling_* helpers.
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1851
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001852class CharmapTest(unittest.TestCase):
1853 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001854 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001855 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001856 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001857 )
1858
Ezio Melottib3aedd42010-11-20 19:04:17 +00001859 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001860 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1861 ("\U0010FFFFbc", 3)
1862 )
1863
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001864 self.assertRaises(UnicodeDecodeError,
1865 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1866 )
1867
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001868 self.assertRaises(UnicodeDecodeError,
1869 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1870 )
1871
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001872 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001873 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001874 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001875 )
1876
Ezio Melottib3aedd42010-11-20 19:04:17 +00001877 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001878 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001879 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001880 )
1881
Ezio Melottib3aedd42010-11-20 19:04:17 +00001882 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001883 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001884 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001885 )
1886
Ezio Melottib3aedd42010-11-20 19:04:17 +00001887 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001888 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001889 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001890 )
1891
Guido van Rossum805365e2007-05-07 22:24:25 +00001892 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001893 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001894 codecs.charmap_decode(allbytes, "ignore", ""),
1895 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001896 )
1897
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001898 def test_decode_with_int2str_map(self):
1899 self.assertEqual(
1900 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1901 {0: 'a', 1: 'b', 2: 'c'}),
1902 ("abc", 3)
1903 )
1904
1905 self.assertEqual(
1906 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1907 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1908 ("AaBbCc", 3)
1909 )
1910
1911 self.assertEqual(
1912 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1913 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1914 ("\U0010FFFFbc", 3)
1915 )
1916
1917 self.assertEqual(
1918 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1919 {0: 'a', 1: 'b', 2: ''}),
1920 ("ab", 3)
1921 )
1922
1923 self.assertRaises(UnicodeDecodeError,
1924 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1925 {0: 'a', 1: 'b'}
1926 )
1927
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001928 self.assertRaises(UnicodeDecodeError,
1929 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1930 {0: 'a', 1: 'b', 2: None}
1931 )
1932
1933 # Issue #14850
1934 self.assertRaises(UnicodeDecodeError,
1935 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1936 {0: 'a', 1: 'b', 2: '\ufffe'}
1937 )
1938
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001939 self.assertEqual(
1940 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1941 {0: 'a', 1: 'b'}),
1942 ("ab\ufffd", 3)
1943 )
1944
1945 self.assertEqual(
1946 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1947 {0: 'a', 1: 'b', 2: None}),
1948 ("ab\ufffd", 3)
1949 )
1950
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001951 # Issue #14850
1952 self.assertEqual(
1953 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1954 {0: 'a', 1: 'b', 2: '\ufffe'}),
1955 ("ab\ufffd", 3)
1956 )
1957
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001958 self.assertEqual(
1959 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1960 {0: 'a', 1: 'b'}),
1961 ("ab", 3)
1962 )
1963
1964 self.assertEqual(
1965 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1966 {0: 'a', 1: 'b', 2: None}),
1967 ("ab", 3)
1968 )
1969
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001970 # Issue #14850
1971 self.assertEqual(
1972 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
1973 {0: 'a', 1: 'b', 2: '\ufffe'}),
1974 ("ab", 3)
1975 )
1976
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001977 allbytes = bytes(range(256))
1978 self.assertEqual(
1979 codecs.charmap_decode(allbytes, "ignore", {}),
1980 ("", len(allbytes))
1981 )
1982
1983 def test_decode_with_int2int_map(self):
1984 a = ord('a')
1985 b = ord('b')
1986 c = ord('c')
1987
1988 self.assertEqual(
1989 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1990 {0: a, 1: b, 2: c}),
1991 ("abc", 3)
1992 )
1993
1994 # Issue #15379
1995 self.assertEqual(
1996 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1997 {0: 0x10FFFF, 1: b, 2: c}),
1998 ("\U0010FFFFbc", 3)
1999 )
2000
Antoine Pitroua1f76552012-09-23 20:00:04 +02002001 self.assertEqual(
2002 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2003 {0: sys.maxunicode, 1: b, 2: c}),
2004 (chr(sys.maxunicode) + "bc", 3)
2005 )
2006
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002007 self.assertRaises(TypeError,
2008 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002009 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002010 )
2011
2012 self.assertRaises(UnicodeDecodeError,
2013 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2014 {0: a, 1: b},
2015 )
2016
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002017 self.assertRaises(UnicodeDecodeError,
2018 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2019 {0: a, 1: b, 2: 0xFFFE},
2020 )
2021
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002022 self.assertEqual(
2023 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2024 {0: a, 1: b}),
2025 ("ab\ufffd", 3)
2026 )
2027
2028 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002029 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2030 {0: a, 1: b, 2: 0xFFFE}),
2031 ("ab\ufffd", 3)
2032 )
2033
2034 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002035 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2036 {0: a, 1: b}),
2037 ("ab", 3)
2038 )
2039
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002040 self.assertEqual(
2041 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2042 {0: a, 1: b, 2: 0xFFFE}),
2043 ("ab", 3)
2044 )
2045
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002046
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers must work as context managers."""

    def test_encodedfile(self):
        # The UTF-8 bytes for U+00FC come back recoded as Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        recoder = codecs.EncodedFile(raw, "latin-1", "utf-8")
        with recoder as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        stream = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                           utf8.streamwriter, 'strict')
        with stream as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002059
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders reject str input with a TypeError.
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is not present in every build of the codecs module.
        if hasattr(codecs, "mbcs_decode"):
            decoders += [codecs.mbcs_decode]
        for decode in decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes string.
        escape_decoders = (codecs.unicode_escape_decode,
                           codecs.raw_unicode_escape_decode)
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))

        # An escape beyond U+10FFFF is an error under "strict" and becomes
        # U+FFFD under "replace".
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2095
Serhiy Storchakad6793772013-01-29 10:20:44 +02002096
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the "unicode_escape" encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is emitted unchanged.
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(codecs.unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character with
        # the same ordinal.
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(
                codecs.unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode

        def check(text, expected):
            # The encoder must report the whole input as consumed.
            self.assertEqual(encode(text), (expected, len(text)))

        named = (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        )
        for text, expected in named:
            check(text, expected)
        # Remaining control characters and the 127..255 range use \xNN.
        for code in range(32):
            if chr(code) in '\t\n\r':
                continue
            check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode

        def check(data, expected):
            # The decoder must report the whole input as consumed.
            self.assertEqual(decode(data), (expected, len(data)))

        cases = (
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        )
        for data, expected in cases:
            check(data, expected)
        # A backslash before any other byte is kept literally.
        for code in range(256):
            if code in b'\n"\'\\abtnvfr01234567xuUN':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for prefix, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with fewer than `digits` hex digits.
                truncated = b"\\" + prefix + b"0" * count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2173
2174
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the "raw_unicode_escape" encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below 256 encodes to the byte of the same value.
        for code in range(256):
            self.assertEqual(codecs.raw_unicode_escape_encode(chr(code)),
                             (bytes([code]), 1))

    def test_raw_decode(self):
        for code in range(256):
            self.assertEqual(
                codecs.raw_unicode_escape_decode(bytes([code]) + b'0'),
                (chr(code) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode

        def check(text, expected):
            # The encoder must report the whole input as consumed.
            self.assertEqual(encode(text), (expected, len(text)))

        # A backslash followed by anything but "u"/"U" passes through as is.
        for code in range(256):
            if code in b'uU':
                continue
            check('\\' + chr(code), b'\\' + bytes([code]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode

        def check(data, expected):
            # The decoder must report the whole input as consumed.
            self.assertEqual(decode(data), (expected, len(data)))

        for code in range(256):
            if code in b'uU':
                continue
            check(b'\\' + bytes([code]), '\\' + chr(code))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for prefix, digits in (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with fewer than `digits` hex digits.
                truncated = b"\\" + prefix + b"0" * count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        too_big = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
2223
2224
class SurrogateEscapeTest(unittest.TestCase):
    """Check the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, raw, text, encoding):
        # Decoding must give `text`, and encoding `text` must restore `raw`.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "utf-8")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip(b"\xed\xb0\x80", "\udced\udcb0\udc80", "utf-8")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip(b"foo\x80bar", "foo\udc80bar", "ascii")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip(b"foo\xa5bar", "foo\udca5bar", "iso-8859-3")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2257
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002258
class BomTest(unittest.TestCase):
    """Byte-order-mark handling of files opened with codecs.open()."""

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def scratch_file(name):
            # Reopen the scratch file in 'w+' mode with the given encoding.
            return codecs.open(support.TESTFN, 'w+', encoding=name)

        for encoding in encodings:
            # Check if the BOM is written only once
            with scratch_file(encoding) as f:
                f.write(text)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with scratch_file(encoding) as f:
                f.write(text[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with scratch_file(encoding) as f:
                writer = f.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with scratch_file(encoding) as f:
                f.write(text)
                f.seek(f.tell())
                f.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with scratch_file(encoding) as f:
                writer = f.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002314
Victor Stinner3fed0872010-05-22 02:16:27 +00002315
# Binary-to-binary transform codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# The compression codecs are only added when their backing module imports.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2334
class TransformCodecTest(unittest.TestCase):
    """Tests for the transform codecs (bytes -> bytes, plus rot_13).

    The binary codecs under test are those listed in the module-level
    ``bytes_transform_encodings``.
    """

    def test_basics(self):
        # Round-trip every byte value through the generic codecs interface.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # These two codecs are deliberately excluded from this check.
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_type_error_for_text_input(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                msg = "^encoding with '{}' codec failed".format(encoding)
                with self.assertRaisesRegex(TypeError, msg) as failure:
                    bad_input.encode(encoding)
                self.assertIsInstance(failure.exception.__cause__,
                                      TypeError)

    def test_type_error_for_binary_input(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = "^decoding with 'rot_13' codec failed"
                with self.assertRaisesRegex(AttributeError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsInstance(failure.exception.__cause__,
                                      AttributeError)

    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        if "zlib_codec" not in bytes_transform_encodings:
            # The codec is only listed when the zlib module could be imported.
            self.skipTest("Requires zlib support")
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            b"hello".decode("zlib_codec")
        # The wrapper has the same type as the error it wraps.
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            b"hello".decode("hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                              type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    def test_bad_decoding_output_type(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                # Raw string for the regex part: "\(" is not a valid escape
                # in an ordinary string literal.
                fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
                       r"use codecs.decode\(\) to decode to arbitrary types")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(TypeError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(TypeError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_bad_encoding_output_type(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types")
        with self.assertRaisesRegex(TypeError, msg):
            "just an example message".encode("rot_13")
2445
2446
2447# The codec system tries to wrap exceptions in order to ensure the error
2448# mentions the operation being performed and the codec involved. We
2449# currently *only* want this to happen for relatively stateless
2450# exceptions, where the only significant information they contain is their
2451# type and a single str argument.
Nick Coghlan4e553e22013-11-16 00:35:34 +10002452
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
# Maps codec names to the entries that _get_test_codec() hands back.
_TEST_CODECS = {}


def _get_test_codec(codec_name):
    """Return the entry stored for *codec_name*, or None when absent."""
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None


# register() returns None, so it is not usable as a decorator.
codecs.register(_get_test_codec)
2460
Nick Coghlan8b097b42013-11-13 23:49:21 +10002461class ExceptionChainingTest(unittest.TestCase):
2462
2463 def setUp(self):
2464 # There's no way to unregister a codec search function, so we just
2465 # ensure we render this one fairly harmless after the test
2466 # case finishes by using the test case repr as the codec name
2467 # The codecs module normalizes codec names, although this doesn't
2468 # appear to be formally documented...
2469 self.codec_name = repr(self).lower().replace(" ", "-")
Nick Coghlan8b097b42013-11-13 23:49:21 +10002470
Nick Coghlan4e553e22013-11-16 00:35:34 +10002471 def tearDown(self):
2472 _TEST_CODECS.pop(self.codec_name, None)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002473
2474 def set_codec(self, obj_to_raise):
2475 def raise_obj(*args, **kwds):
2476 raise obj_to_raise
Nick Coghlan4e553e22013-11-16 00:35:34 +10002477 codec_info = codecs.CodecInfo(raise_obj, raise_obj,
2478 name=self.codec_name)
2479 _TEST_CODECS[self.codec_name] = codec_info
Nick Coghlan8b097b42013-11-13 23:49:21 +10002480
2481 @contextlib.contextmanager
2482 def assertWrapped(self, operation, exc_type, msg):
2483 full_msg = "{} with '{}' codec failed \({}: {}\)".format(
2484 operation, self.codec_name, exc_type.__name__, msg)
2485 with self.assertRaisesRegex(exc_type, full_msg) as caught:
2486 yield caught
2487
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002488 def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002489 self.set_codec(obj_to_raise)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002490 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002491 "str_input".encode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002492 with self.assertWrapped("encoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002493 codecs.encode("str_input", self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002494 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002495 b"bytes input".decode(self.codec_name)
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002496 with self.assertWrapped("decoding", exc_type, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002497 codecs.decode(b"bytes input", self.codec_name)
2498
2499 def test_raise_by_type(self):
2500 self.check_wrapped(RuntimeError, "")
2501
2502 def test_raise_by_value(self):
2503 msg = "This should be wrapped"
2504 self.check_wrapped(RuntimeError(msg), msg)
2505
Nick Coghlanf1de55f2013-11-19 22:33:10 +10002506 def test_raise_grandchild_subclass_exact_size(self):
2507 msg = "This should be wrapped"
2508 class MyRuntimeError(RuntimeError):
2509 __slots__ = ()
2510 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2511
2512 def test_raise_subclass_with_weakref_support(self):
2513 msg = "This should be wrapped"
2514 class MyRuntimeError(RuntimeError):
2515 pass
2516 self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
2517
Nick Coghlan8b097b42013-11-13 23:49:21 +10002518 @contextlib.contextmanager
Nick Coghlanc4c25802013-11-15 21:47:37 +10002519 def assertNotWrapped(self, operation, exc_type, msg_re, msg=None):
2520 if msg is None:
2521 msg = msg_re
Nick Coghlan8b097b42013-11-13 23:49:21 +10002522 with self.assertRaisesRegex(exc_type, msg) as caught:
2523 yield caught
Nick Coghlanc4c25802013-11-15 21:47:37 +10002524 self.assertEqual(str(caught.exception), msg)
Nick Coghlan8b097b42013-11-13 23:49:21 +10002525
Nick Coghlanc4c25802013-11-15 21:47:37 +10002526 def check_not_wrapped(self, obj_to_raise, msg_re, msg=None):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002527 self.set_codec(obj_to_raise)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002528 with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002529 "str input".encode(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002530 with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002531 codecs.encode("str input", self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002532 with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002533 b"bytes input".decode(self.codec_name)
Nick Coghlanc4c25802013-11-15 21:47:37 +10002534 with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
Nick Coghlan8b097b42013-11-13 23:49:21 +10002535 codecs.decode(b"bytes input", self.codec_name)
2536
2537 def test_init_override_is_not_wrapped(self):
2538 class CustomInit(RuntimeError):
2539 def __init__(self):
2540 pass
2541 self.check_not_wrapped(CustomInit, "")
2542
2543 def test_new_override_is_not_wrapped(self):
2544 class CustomNew(RuntimeError):
2545 def __new__(cls):
2546 return super().__new__(cls)
2547 self.check_not_wrapped(CustomNew, "")
2548
2549 def test_instance_attribute_is_not_wrapped(self):
2550 msg = "This should NOT be wrapped"
2551 exc = RuntimeError(msg)
2552 exc.attr = 1
2553 self.check_not_wrapped(exc, msg)
2554
2555 def test_non_str_arg_is_not_wrapped(self):
2556 self.check_not_wrapped(RuntimeError(1), "1")
2557
2558 def test_multiple_args_is_not_wrapped(self):
Nick Coghlanc4c25802013-11-15 21:47:37 +10002559 msg_re = "\('a', 'b', 'c'\)"
2560 msg = "('a', 'b', 'c')"
2561 self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re, msg)
2562
2563 # http://bugs.python.org/issue19609
2564 def test_codec_lookup_failure_not_wrapped(self):
2565 msg = "unknown encoding: %s" % self.codec_name
2566 # The initial codec lookup should not be wrapped
2567 with self.assertNotWrapped("encoding", LookupError, msg):
2568 "str input".encode(self.codec_name)
2569 with self.assertNotWrapped("encoding", LookupError, msg):
2570 codecs.encode("str input", self.codec_name)
2571 with self.assertNotWrapped("decoding", LookupError, msg):
2572 b"bytes input".decode(self.codec_name)
2573 with self.assertNotWrapped("decoding", LookupError, msg):
2574 codecs.decode(b"bytes input", self.codec_name)
2575
Nick Coghlanfdf239a2013-10-03 00:43:22 +10002576
Georg Brandl02524622010-12-02 18:06:51 +00002577
Victor Stinner62be4fb2011-10-18 21:46:37 +02002578@unittest.skipUnless(sys.platform == 'win32',
2579 'code pages are specific to Windows')
Victor Stinner3a50e702011-10-18 21:21:00 +02002580class CodePageTest(unittest.TestCase):
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02002581 # CP_UTF8 is already tested by CP65001Test
Victor Stinner3a50e702011-10-18 21:21:00 +02002582 CP_UTF8 = 65001
Victor Stinner62be4fb2011-10-18 21:46:37 +02002583
Victor Stinner3a50e702011-10-18 21:21:00 +02002584 def test_invalid_code_page(self):
2585 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
2586 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02002587 self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
2588 self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')
Victor Stinner3a50e702011-10-18 21:21:00 +02002589
2590 def test_code_page_name(self):
2591 self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
2592 codecs.code_page_encode, 932, '\xff')
2593 self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
2594 codecs.code_page_decode, 932, b'\x81\x00')
2595 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
2596 codecs.code_page_decode, self.CP_UTF8, b'\xff')
2597
2598 def check_decode(self, cp, tests):
2599 for raw, errors, expected in tests:
2600 if expected is not None:
2601 try:
2602 decoded = codecs.code_page_decode(cp, raw, errors)
2603 except UnicodeDecodeError as err:
2604 self.fail('Unable to decode %a from "cp%s" with '
2605 'errors=%r: %s' % (raw, cp, errors, err))
2606 self.assertEqual(decoded[0], expected,
2607 '%a.decode("cp%s", %r)=%a != %a'
2608 % (raw, cp, errors, decoded[0], expected))
2609 # assert 0 <= decoded[1] <= len(raw)
2610 self.assertGreaterEqual(decoded[1], 0)
2611 self.assertLessEqual(decoded[1], len(raw))
2612 else:
2613 self.assertRaises(UnicodeDecodeError,
2614 codecs.code_page_decode, cp, raw, errors)
2615
2616 def check_encode(self, cp, tests):
2617 for text, errors, expected in tests:
2618 if expected is not None:
2619 try:
2620 encoded = codecs.code_page_encode(cp, text, errors)
2621 except UnicodeEncodeError as err:
2622 self.fail('Unable to encode %a to "cp%s" with '
2623 'errors=%r: %s' % (text, cp, errors, err))
2624 self.assertEqual(encoded[0], expected,
2625 '%a.encode("cp%s", %r)=%a != %a'
2626 % (text, cp, errors, encoded[0], expected))
2627 self.assertEqual(encoded[1], len(text))
2628 else:
2629 self.assertRaises(UnicodeEncodeError,
2630 codecs.code_page_encode, cp, text, errors)
2631
2632 def test_cp932(self):
2633 self.check_encode(932, (
2634 ('abc', 'strict', b'abc'),
2635 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02002636 # test error handlers
Victor Stinner3a50e702011-10-18 21:21:00 +02002637 ('\xff', 'strict', None),
2638 ('[\xff]', 'ignore', b'[]'),
2639 ('[\xff]', 'replace', b'[y]'),
2640 ('[\u20ac]', 'replace', b'[?]'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02002641 ('[\xff]', 'backslashreplace', b'[\\xff]'),
2642 ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02002643 ))
Victor Stinner9e921882011-10-18 21:55:25 +02002644 self.check_decode(932, (
Victor Stinner3a50e702011-10-18 21:21:00 +02002645 (b'abc', 'strict', 'abc'),
2646 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
2647 # invalid bytes
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02002648 (b'[\xff]', 'strict', None),
2649 (b'[\xff]', 'ignore', '[]'),
2650 (b'[\xff]', 'replace', '[\ufffd]'),
2651 (b'[\xff]', 'surrogateescape', '[\udcff]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02002652 (b'\x81\x00abc', 'strict', None),
2653 (b'\x81\x00abc', 'ignore', '\x00abc'),
Victor Stinner9e921882011-10-18 21:55:25 +02002654 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
2655 ))
Victor Stinner3a50e702011-10-18 21:21:00 +02002656
2657 def test_cp1252(self):
2658 self.check_encode(1252, (
2659 ('abc', 'strict', b'abc'),
2660 ('\xe9\u20ac', 'strict', b'\xe9\x80'),
2661 ('\xff', 'strict', b'\xff'),
2662 ('\u0141', 'strict', None),
2663 ('\u0141', 'ignore', b''),
2664 ('\u0141', 'replace', b'L'),
2665 ))
2666 self.check_decode(1252, (
2667 (b'abc', 'strict', 'abc'),
2668 (b'\xe9\x80', 'strict', '\xe9\u20ac'),
2669 (b'\xff', 'strict', '\xff'),
2670 ))
2671
2672 def test_cp_utf7(self):
2673 cp = 65000
2674 self.check_encode(cp, (
2675 ('abc', 'strict', b'abc'),
2676 ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
2677 ('\U0010ffff', 'strict', b'+2//f/w-'),
2678 ('\udc80', 'strict', b'+3IA-'),
2679 ('\ufffd', 'strict', b'+//0-'),
2680 ))
2681 self.check_decode(cp, (
2682 (b'abc', 'strict', 'abc'),
2683 (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
2684 (b'+2//f/w-', 'strict', '\U0010ffff'),
2685 (b'+3IA-', 'strict', '\udc80'),
2686 (b'+//0-', 'strict', '\ufffd'),
2687 # invalid bytes
2688 (b'[+/]', 'strict', '[]'),
2689 (b'[\xff]', 'strict', '[\xff]'),
2690 ))
2691
def test_multibyte_encoding(self):
    """Check error handlers on malformed multibyte input.

    Decoding is checked for code page 932 and for self.CP_UTF8 with
    the 'ignore' and 'replace' handlers.  Encoding to self.CP_UTF8 is
    only checked when VISTA_OR_LATER is true.
    """
    self.check_decode(932, (
        (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
        (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
    ))
    self.check_decode(self.CP_UTF8, (
        (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
        (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
    ))
    # VISTA_OR_LATER is a module-level flag: true only on win32 when
    # sys.getwindowsversion().major >= 6.
    if VISTA_OR_LATER:
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))
2706
def test_incremental(self):
    """Check code page 932 decoding when the last argument is False.

    The expected results show that a trailing incomplete sequence is
    left unconsumed: the returned count stops before it and no error
    is raised under 'strict'.
    """
    # (input bytes, expected (text, consumed) result)
    cases = (
        (b'\x82', ('', 0)),
        (b'\xe9\x80\xe9', ('\u9a3e', 2)),
        (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
        (b'abc', ('abc', 3)),
    )
    for raw, expected in cases:
        # Fourth argument False: presumably the `final` flag, i.e.
        # more input may follow -- confirm against the codecs C API.
        decoded = codecs.code_page_decode(932, raw, 'strict', False)
        self.assertEqual(decoded, expected)
2725
2726
if __name__ == "__main__":
    # Run the test cases in this module when it is executed as a script.
    unittest.main()