blob: fa257b814c837b8452ae2b79227f77f0e79f1e67 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import unittest
Victor Stinner05010702011-05-27 16:50:40 +02003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner05010702011-05-27 16:50:40 +02005import sys, _testcapi, io
Victor Stinner182d90d2011-09-29 19:53:55 +02006
# ctypes is an optional dependency: when it cannot be imported the module
# name is bound to None and SIZEOF_WCHAR_T is set to -1.
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) if ctypes is not None else -1
Marc-André Lemburga37171d2001-06-19 20:09:28 +000014
class Queue(object):
    """
    FIFO buffer: data is appended at one end with write() and taken from
    the other end with read().
    """

    def __init__(self, buffer):
        # The initial buffer (e.g. b"") is the sequence all later data is
        # appended to and sliced from.
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return buffered data from the front of the queue.

        With a negative size (the default) everything buffered is returned;
        otherwise at most size items are returned.
        """
        pending = self._buffer
        if size >= 0:
            self._buffer = pending[size:]
            return pending[:size]
        # Slicing with [:0] yields an empty sequence of the same type.
        self._buffer = pending[:0]
        return pending
34
class MixInCheckStateHandling:
    """Checks for getstate()/setstate() on incremental codecs.

    The methods call assert* helpers on self, so the class is meant to be
    combined with unittest.TestCase.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Cut the encoded input s at every position, save the decoder state
        # at the cut, and make sure a fresh decoder restored from that state
        # produces the remainder of the expected text u.
        new_decoder = codecs.getincrementaldecoder(encoding)
        for cut in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:cut])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # Put the decoder into the default state with an empty
                # buffer of the same type as the buffered input.
                decoder.setstate((state[0][:0], 0))
                # Re-feeding the buffered input must not emit any output ...
                self.assertFalse(decoder.decode(state[0]))
                # ... and must bring the decoder back to the saved state.
                self.assertEqual(state, decoder.getstate())
            # Continue decoding with a brand new decoder that was given the
            # state extracted from the first one.
            resumed = new_decoder()
            resumed.setstate(state)
            tail = resumed.decode(s[cut:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: split the text u at every position and
        # hand the state of the first encoder over to a second one.
        new_encoder = codecs.getincrementalencoder(encoding)
        for cut in range(len(u) + 1):
            first = new_encoder()
            head = first.encode(u[:cut])
            saved = first.getstate()
            second = new_encoder()
            second.setstate(saved)
            tail = second.encode(u[cut:], True)
            self.assertEqual(s, head + tail)
67
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reading tests shared by the codec-specific test classes.

    Every method reads self.encoding, which the concrete subclasses define
    as a class attribute naming the codec under test.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # Wrap the encoded form of input in a StreamReader.
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line until readline() returns an empty
            # string and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line endings must be listed explicitly: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and loops over it would silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line is non-empty (200, 400, ... characters); an empty
            # line would make readalllines() stop early when keepends is
            # false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so the expected values
        # have to be joined the same way.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that separates the repetitions
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were encoded, line endings included.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data becomes available to
        # readline() piece by piece.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # the stream is exhausted after the third line
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # the stream is exhausted after the fifth line
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000287
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, little endian and big endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec_info = codecs.lookup(self.encoding)
        # write the text in two steps through a stream writer
        raw = io.BytesIO()
        out = codec_info.streamwriter(raw)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # the output must contain exactly one BOM
        self.assertIn(encoded, (self.spamle, self.spambe))
        # and it must decode back to the text that was written
        inp = codec_info.streamreader(io.BytesIO(encoded))
        self.assertEqual(inp.read(), "spamspam")

    def test_badbom(self):
        # neither one nor two invalid code units may be accepted as a BOM
        for repeat in (4, 8):
            stream = io.BytesIO(repeat*b"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            with self.assertRaises(UnicodeError):
                reader.read()

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read
            "", # third byte of BOM read
            "", # fourth byte of BOM read => byteorder known
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        # a truncated code unit is replaced or dropped by the error handler
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        samples = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),  # little endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),  # big endian
        )
        for bom, unit in samples:
            self.assertEqual(expected,
                             codecs.utf_32_decode(bom + unit * 1024)[0])
374
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(data)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
414
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(data)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
454
455
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, little endian and big endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        codec_info = codecs.lookup(self.encoding)
        # write the text in two steps through a stream writer
        raw = io.BytesIO()
        out = codec_info.streamwriter(raw)
        for chunk in ("spam", "spam"):
            out.write(chunk)
        encoded = raw.getvalue()
        # the output must contain exactly one BOM
        self.assertIn(encoded, (self.spamle, self.spambe))
        # and it must decode back to the text that was written
        inp = codec_info.streamreader(io.BytesIO(encoded))
        self.assertEqual(inp.read(), "spamspam")

    def test_badbom(self):
        # neither one nor two invalid code units may be accepted as a BOM
        for data in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(data))
            with self.assertRaises(UnicodeError):
                reader.read()

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read => byteorder known
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        # a truncated code unit is replaced or dropped by the error handler
        for handler, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            content = reader.read()
            self.assertEqual(content, text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000531
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        # a character outside the BMP is stored as a surrogate pair
        text = "\U00010203"
        encoded = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
559
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        # a character outside the BMP is stored as a surrogate pair
        text = "\U00010203"
        encoded = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), text)
587
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        # lone surrogates are rejected in both directions by default
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # result of encoding a lone surrogate with each error handler
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets surrogates through when encoding and decoding
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in pairs:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000638
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        # decoded text expected after each further byte of the encoded input
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000653
class UTF16ExTest(unittest.TestCase):
    """Error cases of codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # a single byte cannot be decoded when the input is final
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # the data argument is mandatory
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
661
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # an array is accepted as input and its content returned as bytes
        data = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(data), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # missing argument, then an argument of an unsupported type
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
677
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec."""

    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # the incremental decoder strips the BOM written by the encoder
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        encoded = text.encode("utf-8-sig")
        self.assertEqual(decoder.decode(encoded), text)

    def test_stream_bom(self):
        # input starting with a BOM, read in chunks of various sizes
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                # None means: read without a size argument
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bare(self):
        # input without a BOM, read in chunks of various sizes
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                # None means: read without a size argument
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)
757
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        """Empty input decodes to empty bytes with nothing consumed."""
        # escape_decode() returns a bytes object, and on Python 3 bytes never
        # compare equal to str, so the expected value has to be b"" (the
        # previous expectation of ("", 0) could not be satisfied).
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        # An empty str argument is accepted as well and gives the same result.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000761
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000762class RecodingTest(unittest.TestCase):
763 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000764 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200765 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000766 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000767 f2.close()
768 # Python used to crash on this at exit because of a refcount
769 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000770
# From RFC 3492
# Each entry is a 2-tuple: (text as a str, its punycode encoding as bytes).
# PunycodeTest below encodes the first item and decodes the second one.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check run at import time: print any entry that is not a 2-tuple
# (for example because a comma is missing between two literals).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000878
class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases samples through the "punycode" codec."""

    def test_encode(self):
        """Encoding every sample gives the listed bytes, ignoring case."""
        for text, expected in punycode_testcases:
            encoded = text.encode("punycode")
            # Both sides are lower-cased: some of the extended encodings
            # use upper case while our codec only produces lower case, and
            # lower-casing just the expected value is not enough because
            # some of the input characters are upper case themselves.
            self.assertEqual(
                encoded.decode("ascii").lower(),
                expected.decode("ascii").lower(),
            )

    def test_decode(self):
        """Decoding every sample gives back the original text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII str.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000897
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Valid 4-byte units decode; out-of-range or truncated ones fail."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        flip = sys.byteorder == "little"

        def native(raw):
            # The samples below are written most significant byte first;
            # reverse them when running on a little-endian machine.
            return raw[::-1] if flip else raw

        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        invalid = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in valid:
            self.assertEqual(expected, native(raw).decode("unicode_internal"))
        for raw in invalid:
            data = native(raw)
            with self.assertRaises(UnicodeDecodeError):
                data.decode("unicode_internal")

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the offending range."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            data.decode("unicode_internal")
        error = caught.exception
        self.assertEqual("unicode_internal", error.encoding)
        self.assertEqual(data, error.object)
        self.assertEqual(4, error.start)
        self.assertEqual(8, error.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is used for undecodable input."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Insert four bytes of 0x22 between the encoded "a" and "b".
        payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), "ascii")
        self.assertEqual(("ab", 12), decode(payload, "UnicodeInternalTest"))

    def test_encode_length(self):
        """Issue 3739: the second item of the encoder result is checked."""
        encode = codecs.getencoder("unicode_internal")
        self.assertEqual(encode("a")[1], 1)
        self.assertEqual(encode("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000954
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both given as UTF-8 encoded bytes.
# As consumed by NameprepTest below:
#   * expected is None  -> nameprep() must raise UnicodeError for the input;
#   * (None, None)      -> the vector is skipped, but it keeps its position
#                          so that the "Test 3.N" numbering stays correct.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1107
1108
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        """Check every vector; failures are reported as "Test 3.N"."""
        from encodings.idna import nameprep

        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # This vector is skipped.
                continue
            # The Unicode strings are given in UTF-8.
            text = raw.decode("utf-8", "surrogatepass")
            if expected is None:
                # The input contains prohibited characters.
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                # Re-raise with the number of the failing vector.
                raise support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001127
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes names with and without a final dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes names with and without a final dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading again after the data is exhausted returns an empty str."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decode names one byte at a time and in uneven chunks."""
        def decode_bytewise(data):
            # Feed the incremental decoder a single byte per call.
            return "".join(
                codecs.iterdecode((bytes([c]) for c in data), "idna"))

        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        # This assertion used to be a verbatim copy of the following one;
        # it now covers the non-ASCII name *without* a trailing dot, which
        # mirrors the ASCII pair above and was otherwise left untested.
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org"),
                         "pyth\xf6n.org")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that ends it has been seen;
        # the last label comes out when final=True is passed.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # Same input, but with a trailing dot: nothing is left for final.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encode names one character at a time and in uneven chunks."""
        def encode_charwise(text):
            # iterencode() feeds the incremental encoder one character
            # (one item of the str) per call.
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(encode_charwise("python.org"), b"python.org")
        self.assertEqual(encode_charwise("python.org."), b"python.org.")
        # This assertion used to be a verbatim copy of the following one;
        # it now covers the non-ASCII name *without* a trailing dot.
        self.assertEqual(encode_charwise("pyth\xf6n.org"),
                         b"xn--pythn-mua.org")
        self.assertEqual(encode_charwise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")

        encoder = codecs.getincrementalencoder("idna")()
        # The unfinished last label is held back until final=True.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        # With a trailing dot everything is emitted before the final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001203
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        """codecs.decode() converts bytes and reports bad calls and input."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument is optional.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode() converts str and reports bad calls and input."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument is optional.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or an int argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing argument and unknown names."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """Issue #1813: codec lookup must work under a Turkish locale."""
        # Under Turkish locales, lookup of some codecs failed because 'I'
        # is lowercased as a dotless i.
        previous = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, previous)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # The locale is not supported on this system.
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1258
class StreamReaderTest(unittest.TestCase):
    """Tests for the UTF-8 StreamReader."""

    def setUp(self):
        """Prepare the reader factory and a two-line UTF-8 byte stream."""
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the line ending."""
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001268
class EncodedFileTest(unittest.TestCase):
    # Exercises codecs.EncodedFile in both the read and the write direction.

    def test_basic(self):
        # Reading: the backing stream holds UTF-8, the caller asks for
        # UTF-16-LE bytes.
        backing = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(backing, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed in end up as Latin-1 in the backing
        # stream.
        backing = io.BytesIO()
        recoder = codecs.EncodedFile(backing, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(backing.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001280
# Codec names exercised by the generic per-encoding tests, kept in sorted
# order and grouped by family for readability.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]
1385
# "undefined" is deliberately not tested: that encoding is not supposed
# to work.

# Encodings that don't work in stateful (stream) mode.
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental-coder checks: the ones above
# plus "idna".
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001398
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks that are run against every name listed in
    # all_unicode_encodings.  Every per-encoding assertion names the
    # encoding in its failure message so a failing loop iteration can be
    # attributed.

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the requested name,
            # treating "_" and "-" as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless round trip through getencoder()/getdecoder().
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    # The writer must hand bytes (exactly) to the stream.
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the reader one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "", "%r != %r (encoding=%r)" % (result, "", encoding))

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data,
                                 "read after seek #%d (encoding=%r)" % (t, encoding))

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the int-argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison is intentionally trivial: its only job is to
        # touch the type object.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1531
class CharmapTest(unittest.TestCase):
    # charmap_decode() driven by a plain string used as the mapping table.

    def test_decode_with_string_map(self):
        raw = b"\x00\x01\x02"
        # (error handler, mapping string, expected decoded text); every
        # case must report all three input bytes as consumed.
        cases = (
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, mapping, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(raw, errors, mapping),
                (expected, 3)
            )

        # With an empty map and "ignore", every possible byte is dropped
        # but still counted as consumed.
        allbytes = bytes(range(256))
        decoded = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(decoded, ("", len(allbytes)))
1564
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable as context managers.

    def test_encodedfile(self):
        backing = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(backing, "latin-1", "utf-8") as recoder:
            payload = recoder.read()
            self.assertEqual(payload, b"\xfc")

    def test_streamreaderwriter(self):
        backing = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(backing, utf8.streamreader,
                                       utf8.streamwriter, 'strict') as stream:
            text = stream.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001577
class TypesTest(unittest.TestCase):
    # Which low-level decoders accept str input and which reject it.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7",
            "utf_8",
            "utf_16_le",
            "utf_16_be",
            "utf_16_ex",
            "utf_32",
            "utf_32_le",
            "utf_32_be",
            "utf_32_ex",
            "latin_1",
            "ascii",
            "charmap",
        )
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001607
class SurrogateEscapeTest(unittest.TestCase):
    # Round trips through the "surrogateescape" error handler: each
    # (raw bytes, escaped text) pair is decoded and then encoded back.

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, escaped in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), escaped)
            self.assertEqual(escaped.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, escaped = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, escaped = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1640
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001641
class BomTest(unittest.TestCase):
    # BOM handling of codecs.open() streams across write() and seek().

    def test_seek0(self):
        text = "1234567890"
        doubled = text * 2
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        # The scratch file is shared by all iterations; remove it at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(text)
                stream.write(text)
                # Read the content back twice, rewinding each time.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001697
Victor Stinner3fed0872010-05-22 02:16:27 +00001698
# Bytes-to-bytes transform codecs that are always available.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# The compression codecs are only tested when their backing module imports.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1717
class TransformCodecTest(unittest.TestCase):
    # Round trips through every codec in bytes_transform_encodings.

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # uu_codec and zlib_codec are excluded from the readline() check.
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
1745
1746
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Tests for codecs.code_page_encode() / codecs.code_page_decode().

    # Code page number used for UTF-8 throughout these tests.
    CP_UTF8 = 65001

    def vista_or_later(self):
        """Return True when the Windows major version is 6 or higher."""
        major = sys.getwindowsversion().major
        return major >= 6

    def test_invalid_code_page(self):
        # A negative code page is a ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is reported as a WindowsError.
        with self.assertRaises(WindowsError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page by name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) case with code page *cp*.

        An *expected* of None means the decode must raise
        UnicodeDecodeError; otherwise the decoded text must equal it and
        the reported length must lie within 0..len(raw).
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors)
                continue
            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            self.assertEqual(decoded[0], expected,
                '%a.decode("cp%s", %r)=%a != %a'
                % (raw, cp, errors, decoded[0], expected))
            # assert 0 <= decoded[1] <= len(raw)
            self.assertGreaterEqual(decoded[1], 0)
            self.assertLessEqual(decoded[1], len(raw))

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) case with code page *cp*.

        An *expected* of None means the encode must raise
        UnicodeEncodeError; otherwise the encoded bytes must equal it and
        the reported length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)
                continue
            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            self.assertEqual(encoded[0], expected,
                '%a.encode("cp%s", %r)=%a != %a'
                % (text, cp, errors, encoded[0], expected))
            self.assertEqual(encoded[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # not encodable
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
        )
        self.check_encode(932, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'\xff', 'strict', None),
            (b'\xff', 'ignore', ''),
            (b'\xff', 'replace', '\ufffd'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_cp_utf8(self):
        cp = self.CP_UTF8

        encode_cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # The expected treatment of a lone surrogate depends on the
        # Windows version.
        if self.vista_or_later():
            encode_cases.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
            ))
        else:
            encode_cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        self.check_encode(cp, encode_cases)

        decode_cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
        ]
        if self.vista_or_later():
            decode_cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            decode_cases += [
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ]
        self.check_decode(cp, decode_cases)

    def test_error_handlers(self):
        self.check_encode(932, (
            ('\xff', 'backslashreplace', b'\\xff'),
            ('\xff', 'xmlcharrefreplace', b'&#255;'),
        ))
        self.check_decode(932, (
            (b'\xff', 'surrogateescape', '\udcff'),
        ))
        if not self.vista_or_later():
            return
        self.check_encode(self.CP_UTF8, (
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ))

    def test_multibyte_encoding(self):
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        if not self.vista_or_later():
            return
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_incremental(self):
        # Decoding with final=False: each case lists the input and the
        # exact (text, consumed) result expected from code page 932.
        cases = (
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
1942
1943
def test_main():
    # Hand every test class of this module, in a fixed order, to the
    # test.support runner.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()