# blob: cb618ece8c2e507ba4877534af3e583f218d6bed
import codecs
import io
import locale
import sys
import unittest
import warnings

from test import support

# True only when running on Windows with a major OS version of 6 or higher.
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False

# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T falls back to -1.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)

def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to *coder*.

    *self* supplies ``assertEqual`` (a test case instance); *coder* is called
    with the input and must return a two-item tuple.  The returned helper
    asserts that ``coder(input)`` equals ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer only needs to support ``+=`` and slicing; whatever object is
    passed to the constructor is used as the initial content.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return data from the head of the queue: everything when
        # *size* is negative, otherwise at most *size* items.
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class MixInCheckStateHandling:
    """Helpers that check getstate()/setstate() of incremental codecs.

    The methods rely on ``assertEqual``/``assertIsInstance``/``assertFalse``
    being available on ``self``.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split the encoded input *s* at every possible offset, transfer the
        # decoder state to a fresh decoder at the split point and check that
        # both halves together still decode to *u*.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split the text *u* at every offset, move the
        # encoder state to a fresh encoder and check the output equals *s*.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)

class ReadTest(MixInCheckStateHandling):
    """Reader tests shared by the codec test classes.

    Every method reads ``self.encoding``, which this class does not define;
    the concrete test classes provide it and also inherit from
    ``unittest.TestCase``.
    """

    def check_partial(self, input, partialresults):
        # The encoded form is the same for every pass below, so compute it
        # only once.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read every line and join them with "|" so that the line
            # boundaries show up in the compared string.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # The list below is plain test data: it is encoded, read back line by
        # line and every line is compared with the corresponding entry.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written between two
        # readline() calls becomes visible to the reader incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' (still without a 'b' flag) replaces the deprecated 'U'
        # mode, which open() no longer accepts in recent Python versions.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (invalid input, expected result of decoding with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")

class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (invalid input, expected result of decoding with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")

class UTF8Test(ReadTest, unittest.TestCase):
    # Codec name used by the checks in this class (and presumably by the
    # inherited ReadTest checks -- confirm against ReadTest).
    encoding = "utf-8"

    def test_partial(self):
        # One expected prefix per encoded byte of `text` (15 entries); the
        # table is consistent with a character appearing only after its
        # last byte has been supplied.
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        decoded_so_far = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, decoded_so_far)

    def test_decoder_state(self):
        # The sample mixes characters from 0x00 up to U+10FFFF.
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = sample.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, sample, encoded)

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")

        # Every other error handler has its own well-defined output.
        lone_low = "[\uDC80]"
        handled = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, encoded in handled:
            self.assertEqual(lone_low.encode("utf-8", handler), encoded)

    def test_surrogatepass_handler(self):
        # "surrogatepass" must round-trip lone surrogates, also when they
        # follow a non-BMP character.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))

        # Incomplete surrogate sequences are still errors.
        for incomplete in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                incomplete.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000748
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    encoding = "cp65001"

    def test_encode(self):
        # Rows are (text, error handler, expected bytes).  An expected value
        # of None means the call must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            # Before Vista a lone surrogate is expected to encode even in
            # strict mode.
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases.extend([
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ])

        for text, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    text.encode("cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            message = ('%a.encode("cp65001", %r)=%a != %a'
                       % (text, errors, encoded, expected))
            self.assertEqual(encoded, expected, message)

    def test_decode(self):
        # Rows are (raw bytes, error handler, expected text).  An expected
        # value of None means the call must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            # Before Vista an encoded lone surrogate is expected to decode
            # even in strict mode.
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases.extend([
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ])

        for raw, errors, expected in cases:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    raw.decode('cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            message = ('%a.decode("cp65001", %r)=%a != %a'
                       % (raw, errors, decoded, expected))
            self.assertEqual(decoded, expected, message)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")

        # Every other error handler has its own well-defined output.
        lone_low = "[\uDC80]"
        handled = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, encoded in handled:
            self.assertEqual(lone_low.encode("cp65001", handler), encoded)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" must round-trip lone surrogates, also when they
        # follow a non-BMP character.
        round_trips = [
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        ]
        for text, raw in round_trips:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
847
848
849
class UTF7Test(ReadTest, unittest.TestCase):
    encoding = "utf-7"

    def test_partial(self):
        # Expected decoder output after each step of incremental input
        # (32 entries); see check_partial for how the table is consumed.
        text = 'a+-b\x00c\x80d\u0100e\U00010000f'
        decoded_so_far = [
            'a',
            'a',
            'a+',
            'a+-',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b',
            'a+-b\x00',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c',
            'a+-b\x00c\x80',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d',
            'a+-b\x00c\x80d\u0100',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e',
            'a+-b\x00c\x80d\u0100e\U00010000',
            'a+-b\x00c\x80d\u0100e\U00010000f',
        ]
        self.check_partial(text, decoded_so_far)

    def test_errors(self):
        # Each row pairs malformed UTF-7 input with the text produced by
        # errors='replace'; strict decoding with final=True must raise.
        cases = (
            (b'a\xffb', 'a\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
        )
        strict_decode = codecs.utf_7_decode
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                strict_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-7', 'replace'), replaced)

    def test_nonbmp(self):
        # The astral character and its explicit surrogate pair share one
        # encoded form, which decodes back to the astral character.
        astral = '\U000104A0'
        as_surrogates = '\ud801\udca0'
        wire = b'+2AHcoA-'
        self.assertEqual(astral.encode(self.encoding), wire)
        self.assertEqual(as_surrogates.encode(self.encoding), wire)
        self.assertEqual(wire.decode(self.encoding), astral)
920
class UTF16ExTest(unittest.TestCase):
    # Error and argument handling of codecs.utf_16_ex_decode().

    def test_errors(self):
        # A single byte cannot be decoded as strict UTF-16.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the mandatory data argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
928
class ReadBufferTest(unittest.TestCase):
    # Exercises codecs.readbuffer_encode() with buffer-like inputs.

    def test_array(self):
        import array
        source = array.array("b", b"spam")
        outcome = codecs.readbuffer_encode(source)
        self.assertEqual(outcome, (b"spam", 4))

    def test_empty(self):
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor a plain integer is acceptable.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
944
class UTF8SigTest(ReadTest, unittest.TestCase):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the signature that the
        # encoder prepends.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        # Shared driver for the two stream tests below: drain a utf-8-sig
        # StreamReader over `bytestring` once with an unbounded read()
        # (sizehint None) and once for each listed read size, and require
        # that the concatenated chunks equal `unistring` every time.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                # An empty result marks the end of the stream.
                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 signature: it must be skipped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a signature must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
1028
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        outcome = codecs.escape_decode(b"")
        self.assertEqual(outcome, (b"", 0))

    def test_raw(self):
        # Every byte except the backslash passes through unchanged.
        for code in range(256):
            byte = bytes([code])
            if byte == b'\\':
                continue
            raw = byte + b'0'
            self.assertEqual(codecs.escape_decode(raw), (raw, 2))

    def test_escape(self):
        check = coding_checker(self, codecs.escape_decode)
        # (escaped input, expected decoded bytes)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, decoded in cases:
            check(escaped, decoded)

        # A backslash followed by any byte outside this set is expected to
        # be left untouched.
        escape_chars = b'\n"\'\\abtnvfr01234567x'
        for code in range(256):
            if code in escape_chars:
                continue
            literal = b'\\' + bytes([code])
            check(literal, literal)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: an error by default, dropped with "ignore"
        # and turned into "?" with "replace".
        for truncated in (br"\x", br"\x0"):
            bracketed = b"[" + truncated + b"]"
            doubled = bracketed + truncated
            for malformed in (truncated, bracketed):
                with self.assertRaises(ValueError):
                    decode(malformed)
            self.assertEqual(decode(doubled, "ignore"),
                             (b"[]", len(doubled)))
            self.assertEqual(decode(doubled, "replace"),
                             (b"[?]?", len(doubled)))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +02001080
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Regression test: Python used to crash on this at exit because of
        # a refcount bug in _codecsmodule.c.  There is nothing to assert;
        # the write/close sequence itself is the test.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +00001089
# Sample strings from RFC 3492.
# Each entry is a 2-tuple: (text as str, its expected punycode form as
# bytes).  PunycodeTest below iterates over the table in both directions.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example after a misplaced comma joined or split two string literals).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001197
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively: some of the reference encodings
            # use upper case while our encoder only emits lower case, and
            # lowering just the reference is not enough because some of the
            # input characters are upper case themselves.
            produced = str(uni.encode("punycode"), "ascii").lower()
            reference = str(puny, "ascii").lower()
            self.assertEqual(produced, reference)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an ASCII
            # round trip (same content, freshly created object).
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001216
class UnicodeInternalTest(unittest.TestCase):
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The byte strings above are written big-endian; reverse them on
        # little-endian machines.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is the first value above the valid code point range.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # assertRaises is the outer context so that, exactly as with the
        # former try/except, the exception propagates through
        # check_warnings() before being captured.  If nothing is raised the
        # test now fails with an explicit "UnicodeDecodeError not raised".
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
        # The error must cover the second 4-byte unit.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a custom handler name and
        # check that the four invalid bytes spliced between the encoded
        # "a" and "b" are skipped while all 12 input bytes are consumed.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # The second item of the encoder's result must equal the number
            # of input characters/bytes for these inputs.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001292
Martin v. Löwis2548c732003-04-18 10:39:54 +00001293# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1294nameprep_tests = [
1295 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001296 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1297 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1298 b'\xb8\x8f\xef\xbb\xbf',
1299 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001300 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001301 (b'CAFE',
1302 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001303 # 3.3 Case folding 8bit U+00DF (german sharp s).
1304 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001305 (b'\xc3\x9f',
1306 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001307 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001308 (b'\xc4\xb0',
1309 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001310 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001311 (b'\xc5\x83\xcd\xba',
1312 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001313 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1314 # XXX: skip this as it fails in UCS-2 mode
1315 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1316 # 'telc\xe2\x88\x95kg\xcf\x83'),
1317 (None, None),
1318 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001319 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1320 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001321 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001322 (b'\xe1\xbe\xb7',
1323 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001324 # 3.9 Self-reverting case folding U+01F0 and normalization.
1325 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001326 (b'\xc7\xb0',
1327 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001328 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001329 (b'\xce\x90',
1330 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001331 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001332 (b'\xce\xb0',
1333 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001334 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001335 (b'\xe1\xba\x96',
1336 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001337 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001338 (b'\xe1\xbd\x96',
1339 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001340 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001341 (b' ',
1342 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001343 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001344 (b'\xc2\xa0',
1345 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001346 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001347 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001348 None),
1349 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001350 (b'\xe2\x80\x80',
1351 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001352 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001353 (b'\xe2\x80\x8b',
1354 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001355 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001356 (b'\xe3\x80\x80',
1357 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001358 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001359 (b'\x10\x7f',
1360 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001361 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001362 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001363 None),
1364 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001365 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001366 None),
1367 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001368 (b'\xef\xbb\xbf',
1369 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001370 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001371 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001372 None),
1373 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001374 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001375 None),
1376 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001377 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001378 None),
1379 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001380 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001381 None),
1382 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001383 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001384 None),
1385 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001386 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001387 None),
1388 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001389 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001390 None),
1391 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001392 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001393 None),
1394 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001395 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001396 None),
1397 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001398 (b'\xcd\x81',
1399 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001400 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001401 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001402 None),
1403 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001404 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001405 None),
1406 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001407 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001408 None),
1409 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001410 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001411 None),
1412 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001413 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001414 None),
1415 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001416 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001417 None),
1418 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001419 (b'foo\xef\xb9\xb6bar',
1420 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001421 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001422 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001423 None),
1424 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001425 (b'\xd8\xa71\xd8\xa8',
1426 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001427 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001428 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001429 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001430 # None),
1431 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001432 # 3.44 Larger test (shrinking).
1433 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001434 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1435 b'\xaa\xce\xb0\xe2\x80\x80',
1436 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001437 # 3.45 Larger test (expanding).
1438 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001439 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1440 b'\x80',
1441 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1442 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1443 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001444 ]
1445
1446
class NameprepTest(unittest.TestCase):
    """Run the nameprep vectors stored in the module-level ``nameprep_tests``."""

    def test_nameprep(self):
        """Check ``encodings.idna.nameprep`` against every vector.

        Each vector is an ``(orig, prepped)`` pair of UTF-8 byte strings.
        ``orig is None`` marks a skipped vector; ``prepped is None`` means
        ``nameprep`` must reject the input with ``UnicodeError``.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Same "Test 3.N" label for both branches, so a failure always
            # identifies the offending vector.
            label = "Test 3.%d" % (pos + 1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError, msg=label):
                    nameprep(orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    raise support.TestFailed("%s: %s" % (label, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001465
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") handles plain and ACE labels, with/without a final dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") handles plain and non-ASCII labels, with/without a final dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader yields an empty string."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Feed the decoder one byte at a time and in uneven chunks."""
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # ACE label without a trailing dot: the last label is only released
        # by the final flush.  (This slot used to repeat the trailing-dot
        # case below, leaving the dot-less form untested.)
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # Output is produced label by label: nothing until a dot is seen,
        # and the last label only when final=True.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Feed the encoder one character at a time and in uneven chunks."""
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII label without a trailing dot.  (This slot used to repeat
        # the trailing-dot case below, leaving the dot-less form untested.)
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        # As with decoding, the last label is held back until final=True.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001541
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level ``codecs`` functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        # Both a missing argument and a non-callable are rejected.
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved = locale.setlocale(locale.LC_CTYPE)
        # Restore the caller's LC_CTYPE however this test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1596
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 stream reader."""

    def setUp(self):
        # Two UTF-8 encoded lines; only the first ends with a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001606
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile, in both directions."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller sees UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001618
# Codec names exercised by BasicUnicodeTest, grouped by family.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    # code pages
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950",
    # EUC / GB
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    # ISO 2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO 8859
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    # Macintosh
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    # UTF
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Everything broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001736
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in ``all_unicode_encodings``.

    Every assertion carries an ``encoding=...`` message so a failure names
    the codec that caused it.
    """

    def test_basics(self):
        """Round-trip a short ASCII string through each codec's interfaces.

        Covers the stateless encoder/decoder, the stream reader/writer
        (skipped for ``broken_unicode_with_streams``) and the incremental
        coders plus iterencode()/iterdecode() (skipped for
        ``broken_incremental_coders``).
        """
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "encoding=%r" % encoding)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the reader one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        """Repeat the incremental round-trip with coders fetched via _testcapi."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        """seek(0, 0) on a stream reader must allow the data to be re-read."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) a non-bytes one."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            msg = "encoding=%r" % encoding
            with self.assertRaises(TypeError, msg=msg):
                decoder()
            if encoding not in ("idna", "punycode"):
                with self.assertRaises(TypeError, msg=msg):
                    decoder(42)

    def test_bad_encode_args(self):
        """Encoders reject a missing argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                with self.assertRaises(TypeError,
                                       msg="encoding=%r" % encoding):
                    encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1898
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001899class CharmapTest(unittest.TestCase):
1900 def test_decode_with_string_map(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001901 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001902 codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001903 ("abc", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001904 )
1905
Ezio Melottib3aedd42010-11-20 19:04:17 +00001906 self.assertEqual(
Antoine Pitroua1f76552012-09-23 20:00:04 +02001907 codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
1908 ("\U0010FFFFbc", 3)
1909 )
1910
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001911 self.assertRaises(UnicodeDecodeError,
1912 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
1913 )
1914
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001915 self.assertRaises(UnicodeDecodeError,
1916 codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
1917 )
1918
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001919 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001920 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001921 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001922 )
1923
Ezio Melottib3aedd42010-11-20 19:04:17 +00001924 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001925 codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001926 ("ab\ufffd", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001927 )
1928
Ezio Melottib3aedd42010-11-20 19:04:17 +00001929 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001930 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001931 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001932 )
1933
Ezio Melottib3aedd42010-11-20 19:04:17 +00001934 self.assertEqual(
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001935 codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001936 ("ab", 3)
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001937 )
1938
Guido van Rossum805365e2007-05-07 22:24:25 +00001939 allbytes = bytes(range(256))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001940 self.assertEqual(
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001941 codecs.charmap_decode(allbytes, "ignore", ""),
1942 ("", len(allbytes))
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001943 )
1944
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001945 def test_decode_with_int2str_map(self):
1946 self.assertEqual(
1947 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1948 {0: 'a', 1: 'b', 2: 'c'}),
1949 ("abc", 3)
1950 )
1951
1952 self.assertEqual(
1953 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1954 {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
1955 ("AaBbCc", 3)
1956 )
1957
1958 self.assertEqual(
1959 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1960 {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
1961 ("\U0010FFFFbc", 3)
1962 )
1963
1964 self.assertEqual(
1965 codecs.charmap_decode(b"\x00\x01\x02", "strict",
1966 {0: 'a', 1: 'b', 2: ''}),
1967 ("ab", 3)
1968 )
1969
1970 self.assertRaises(UnicodeDecodeError,
1971 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1972 {0: 'a', 1: 'b'}
1973 )
1974
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001975 self.assertRaises(UnicodeDecodeError,
1976 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1977 {0: 'a', 1: 'b', 2: None}
1978 )
1979
1980 # Issue #14850
1981 self.assertRaises(UnicodeDecodeError,
1982 codecs.charmap_decode, b"\x00\x01\x02", "strict",
1983 {0: 'a', 1: 'b', 2: '\ufffe'}
1984 )
1985
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001986 self.assertEqual(
1987 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1988 {0: 'a', 1: 'b'}),
1989 ("ab\ufffd", 3)
1990 )
1991
1992 self.assertEqual(
1993 codecs.charmap_decode(b"\x00\x01\x02", "replace",
1994 {0: 'a', 1: 'b', 2: None}),
1995 ("ab\ufffd", 3)
1996 )
1997
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02001998 # Issue #14850
1999 self.assertEqual(
2000 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2001 {0: 'a', 1: 'b', 2: '\ufffe'}),
2002 ("ab\ufffd", 3)
2003 )
2004
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002005 self.assertEqual(
2006 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2007 {0: 'a', 1: 'b'}),
2008 ("ab", 3)
2009 )
2010
2011 self.assertEqual(
2012 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2013 {0: 'a', 1: 'b', 2: None}),
2014 ("ab", 3)
2015 )
2016
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002017 # Issue #14850
2018 self.assertEqual(
2019 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2020 {0: 'a', 1: 'b', 2: '\ufffe'}),
2021 ("ab", 3)
2022 )
2023
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002024 allbytes = bytes(range(256))
2025 self.assertEqual(
2026 codecs.charmap_decode(allbytes, "ignore", {}),
2027 ("", len(allbytes))
2028 )
2029
2030 def test_decode_with_int2int_map(self):
2031 a = ord('a')
2032 b = ord('b')
2033 c = ord('c')
2034
2035 self.assertEqual(
2036 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2037 {0: a, 1: b, 2: c}),
2038 ("abc", 3)
2039 )
2040
2041 # Issue #15379
2042 self.assertEqual(
2043 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2044 {0: 0x10FFFF, 1: b, 2: c}),
2045 ("\U0010FFFFbc", 3)
2046 )
2047
Antoine Pitroua1f76552012-09-23 20:00:04 +02002048 self.assertEqual(
2049 codecs.charmap_decode(b"\x00\x01\x02", "strict",
2050 {0: sys.maxunicode, 1: b, 2: c}),
2051 (chr(sys.maxunicode) + "bc", 3)
2052 )
2053
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002054 self.assertRaises(TypeError,
2055 codecs.charmap_decode, b"\x00\x01\x02", "strict",
Antoine Pitroua1f76552012-09-23 20:00:04 +02002056 {0: sys.maxunicode + 1, 1: b, 2: c}
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002057 )
2058
2059 self.assertRaises(UnicodeDecodeError,
2060 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2061 {0: a, 1: b},
2062 )
2063
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002064 self.assertRaises(UnicodeDecodeError,
2065 codecs.charmap_decode, b"\x00\x01\x02", "strict",
2066 {0: a, 1: b, 2: 0xFFFE},
2067 )
2068
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002069 self.assertEqual(
2070 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2071 {0: a, 1: b}),
2072 ("ab\ufffd", 3)
2073 )
2074
2075 self.assertEqual(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002076 codecs.charmap_decode(b"\x00\x01\x02", "replace",
2077 {0: a, 1: b, 2: 0xFFFE}),
2078 ("ab\ufffd", 3)
2079 )
2080
2081 self.assertEqual(
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002082 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2083 {0: a, 1: b}),
2084 ("ab", 3)
2085 )
2086
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02002087 self.assertEqual(
2088 codecs.charmap_decode(b"\x00\x01\x02", "ignore",
2089 {0: a, 1: b, 2: 0xFFFE}),
2090 ("ab", 3)
2091 )
2092
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02002093
class WithStmtTest(unittest.TestCase):
    """Use the codecs stream wrappers as context managers."""

    def test_encodedfile(self):
        # The stream holds the UTF-8 bytes of U+00FC; reading through the
        # wrapper must hand them back recoded as Latin-1.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00002106
class TypesTest(unittest.TestCase):
    """Check which input types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders reject str input with TypeError.
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only present on some builds; test it when it is.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a str is supported and gives the same result as
        # decoding the equivalent ASCII bytes.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is past the last code point: an error in strict mode,
        # a single U+FFFD with "replace".
        for decode in escape_decoders:
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
2142
Serhiy Storchakad6793772013-01-29 10:20:44 +02002143
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is emitted unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte except the backslash decodes to the character with the
        # same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining control characters and 127..255 use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash in front of any other byte is kept literally.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Each pair is (escape letter, hex digits a complete escape needs).
        # \U takes 8 digits, so every truncated length 0..7 is exercised.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets, one at end of input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape whose value is past the last code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2220
2221
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() and its decode twin."""

    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every character below 256 is emitted as the byte of equal value.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # A backslash followed by anything but u/U passes through unchanged.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # Only \u and \U introduce an escape; other pairs stay literal.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Each pair is (escape letter, hex digits a complete escape needs).
        # \U takes 8 digits, so every truncated length 0..7 is exercised.
        for c, d in (b'u', 4), (b'U', 8):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                # One truncated escape inside brackets, one at end of input.
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete escape whose value is past the last code point.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2270
2271
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        samples = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in samples:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
2304
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002305
class BomTest(unittest.TestCase):
    """Check BOM handling of codecs.open() files across seeks."""

    def _reopen(self, encoding):
        # Truncate and reopen the scratch file for reading and writing.
        return codecs.open(support.TESTFN, 'w+', encoding=encoding)

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Two consecutive writes must read back as the plain doubled
            # text, i.e. the BOM is written only once.
            with self._reopen(encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # After seek(0) a rewrite starts over, BOM included.
            with self._reopen(encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Same check, driving the underlying StreamWriter directly.
            with self._reopen(encoding) as stream:
                stream.writer.write(data[0])
                self.assertNotEqual(stream.writer.tell(), 0)
                stream.writer.seek(0)
                stream.writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Seeking to a position other than the start must not make the
            # next write emit another BOM.
            with self._reopen(encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)

            # Same check, driving the underlying StreamWriter directly.
            with self._reopen(encoding) as stream:
                stream.writer.write(data)
                stream.writer.seek(stream.writer.tell())
                stream.writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002361
Victor Stinner3fed0872010-05-22 02:16:27 +00002362
# Names of the bytes-to-bytes codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# The compression codecs are added only when their backing module can be
# imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2381
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the codecs named in bytes_transform_encodings."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
2407
2408
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode()."""

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is a ValueError; 123 is reported by the
        # system as WindowsError.
        for error, cp in ((ValueError, -1), (WindowsError, 123)):
            self.assertRaises(error, codecs.code_page_encode, cp, 'a')
            self.assertRaises(error, codecs.code_page_decode, cp, b'a')

    def test_code_page_name(self):
        # The error message must name the code page involved.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An expected value of None means the call must raise
        UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # The consumed count must satisfy 0 <= consumed <= len(raw).
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An expected value of None means the call must raise
        UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            raw = encoded[0]
            consumed = encoded[1]
            self.assertEqual(raw, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, raw, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        # The CP_UTF8 encode cases only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # The fourth argument (final) is False, so a trailing incomplete
        # multibyte sequence is left unconsumed rather than being an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2556
2557
# Script entry point: hand control to unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()