blob: e74038be4d85d47c1601b326c0b8db45e3dcd2a0 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Return a checker that asserts on the result of *coder*.

    The returned callable takes ``(input, expect)`` and uses
    ``self.assertEqual`` to verify that ``coder(input)`` equals the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
11
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end.
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the buffer type: emptying it is
        # done by slicing, which keeps that type.
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most
        # ``size`` items are removed from the head and returned.
        if size < 0:
            data, self._buffer = self._buffer, self._buffer[:0]
            return data
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
31
class MixInCheckStateHandling:
    """Mixin with helpers that exercise getstate()/setstate() on
    incremental codecs.  The host class must provide the unittest
    assertion methods used below."""

    def check_state_handling_decode(self, encoding, u, s):
        """Split the encoded input *s* at every position, carry the decoder
        state over into a fresh decoder, and check that the two decoded
        parts concatenate to *u*."""
        new_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                pending = state[0]
                # put the decoder back into its default state with an
                # empty buffer of the same type
                decoder.setstate((pending[:0], 0))
                # Re-feeding the buffered input may not produce any output
                self.assertTrue(not decoder.decode(pending))
                # ... and must bring the decoder back to the same state
                self.assertEqual(state, decoder.getstate())
            # Transplant the extracted state into a brand new decoder
            decoder = new_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split the text *u* at every position, carry the encoder state
        over into a fresh encoder, and check that the two encoded parts
        concatenate to *s*."""
        new_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            encoder = new_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
64
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader/decoder tests shared by the Unicode codec test classes.

    Every test looks the codec up through ``self.encoding``, which concrete
    subclasses set as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* byte by byte and check the output.

        ``partialresults[k]`` is the text that must have been produced after
        the first ``k + 1`` encoded bytes were consumed.  The check is done
        with a StreamReader, with an incremental decoder, with the same
        decoder after ``reset()``, and finally with ``codecs.iterdecode()``.

        NOTE: the byte/result pairing uses zip(), which stops at the shorter
        sequence, so a *partialresults* list of the wrong length is not
        detected here.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        self._check_partial_incremental(d, encoded, partialresults)

        # Check whether the reset method works properly
        d.reset()
        self._check_partial_incremental(d, encoded, partialresults)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def _check_partial_incremental(self, decoder, encoded, partialresults):
        """Run one byte-by-byte pass of *encoded* through *decoder*."""
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += decoder.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(decoder.decode(b"", True), "")
        self.assertEqual(decoder.buffer, b"")

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: all of them count as
        # whitespace, so building this list with str.split() yields [] and
        # the loops below would never execute.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty; an empty line would
            # end readalllines() early when keepends is false.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so the expected value
        # has to be joined the same way.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # every "a..." line is followed by an "xxx" line that
                    # has to be consumed as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the input's lines
        # (all terminated by \r\n) one by one, unchanged.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only ever sees
        # what has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # every line comes back intact, followed by "" at end of input
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # every line comes back intact, followed by "" at end of input
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)
Walter Dörwald9fa09462005-01-10 12:01:39 +0000284
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec (byte order taken from the BOM)."""

    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        writer = info.streamwriter(sink)
        writer.write("spam")
        writer.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it; either byte order
        # is accepted.  assertIn shows the offending bytes on failure.
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        reader = info.streamreader(io.BytesIO(data))
        self.assertEqual(reader.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM.
        for count in (4, 8):
            stream = io.BytesIO(count * b"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        # One expected result per encoded byte: 4 BOM bytes, then 4 bytes
        # per character; a character shows up with its last byte.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""] * 4        # BOM; byteorder known after the fourth byte
            + [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped, depending on the
        # error handler; one byte is consumed either way.
        replaced = codecs.utf_32_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
375
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        # One expected result per encoded byte: every character takes 4
        # bytes and shows up once its last byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
419
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        # One expected result per encoded byte: every character takes 4
        # bytes and shows up once its last byte has been read.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
463
464
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec (byte order taken from the BOM)."""

    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        writer = info.streamwriter(sink)
        writer.write("spam")
        writer.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it; either byte order
        # is accepted.  assertIn shows the offending bytes on failure.
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        reader = info.streamreader(io.BytesIO(data))
        self.assertEqual(reader.read(), "spamspam")

    def test_badbom(self):
        # Neither one nor two code units of 0xff bytes form a valid BOM.
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        # One expected result per encoded byte: 2 BOM bytes, 2 bytes per
        # BMP character and 4 bytes for the surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""] * 2        # BOM; byteorder known after the second byte
            + [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped, depending on the
        # error handler; one byte is consumed either way.
        replaced = codecs.utf_16_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_16_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is a mode without 'b', which is all this test needs.
        # The 'U' flag used previously was deprecated and is rejected by
        # open() from Python 3.11 on.
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000544
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        # One expected result per encoded byte: 2 bytes per BMP character
        # and 4 bytes for the surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        # (malformed input, its decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            # strict decoding must fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            replaced = raw.decode('utf-16le', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # round trip of a character encoded as a surrogate pair
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
587
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        # One expected result per encoded byte: 2 bytes per BMP character
        # and 4 bytes for the surrogate pair.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )

    def test_errors(self):
        # (malformed input, its decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for raw, expected in cases:
            # strict decoding must fail ...
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            # ... while 'replace' substitutes U+FFFD
            replaced = raw.decode('utf-16be', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # round trip of a character encoded as a surrogate pair
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
630
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        # One expected result per encoded byte; a character shows up once
        # the last byte of its 1- to 4-byte sequence has been read.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in both directions by default ...
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # ... and are handed to the error handler when encoding.
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        text = "abc\ud800def"
        raw = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
        self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # truncated or interrupted surrogate sequences still fail
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000685
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        # One expected result per encoded byte of "a+-b".
        text = "a+-b"
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000700
class UTF16ExTest(unittest.TestCase):
    """Tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # a single byte cannot be decoded in strict mode with final=True
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # calling without any argument is a TypeError
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
708
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # an array of bytes comes back as bytes plus the consumed length
        source = array.array("b", b"spam")
        encoded = codecs.readbuffer_encode(source)
        self.assertEqual(encoded, (b"spam", 4))

    def test_empty(self):
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        # missing argument and non-buffer argument are both TypeErrors
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
724
class UTF8SigTest(ReadTest):
    """Run the inherited ReadTest checks with the "utf-8-sig" encoding."""

    encoding = "utf-8-sig"

    def test_partial(self):
        """Check the partial results expected for input starting with two BOMs.

        NOTE(review): check_partial() is inherited from ReadTest; presumably
        it feeds the encoded input incrementally and compares each
        intermediate result with the list below -- confirm against ReadTest.
        """
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        """The incremental decoder strips the BOM written by the encoder."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_sized_stream_reads(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader in chunks.

        The stream is re-read once without a size argument and once for each
        of several size arguments; whatever the chunking, the concatenated
        output must equal *unistring*.
        """
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        """Stream input that starts with a BOM decodes without the BOM."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_stream_reads(bytestring, unistring)

    def test_stream_bare(self):
        """Stream input without a BOM decodes to the same text."""
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_sized_stream_reads(bytestring, unistring)
808
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def _check(self, data, expected):
        """Assert that *data* decodes to *expected*, consuming all of it."""
        self.assertEqual(codecs.escape_decode(data), (expected, len(data)))

    def test_empty(self):
        """An empty str decodes to empty bytes with nothing consumed."""
        self.assertEqual(codecs.escape_decode(""), (b"", 0))

    def test_raw(self):
        """Every byte other than a backslash is passed through unchanged."""
        for b in range(256):
            if b != b'\\'[0]:
                self._check(bytes([b]) + b'0', bytes([b]) + b'0')

    def test_escape(self):
        """Recognised escapes are translated; unknown ones are kept as is."""
        check = self._check
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Any byte that does not start a recognised escape leaves the
        # backslash sequence untouched.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567x':
                check(b'\\' + bytes([b]), b'\\' + bytes([b]))

    def test_errors(self):
        """Truncated \\x escapes raise ValueError unless a handler is given."""
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
856
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile."""

    def test_recoding(self):
        """Write through an EncodedFile wrapper and close it cleanly."""
        f = io.BytesIO()
        try:
            f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        except LookupError:
            # unicode_internal is a deprecated codec; report a skip rather
            # than an error on interpreters that no longer provide it.
            self.skipTest("unicode_internal codec is not available")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000865
# From RFC 3492
# Each entry is a (unicode string, expected punycode bytes) pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check at import time: report any entry that is not a 2-tuple
# (for example because a comma was lost between two literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000973
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        """Encoding each Unicode sample gives the expected punycode."""
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        """Decoding each punycode sample gives back the Unicode string."""
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again after a round trip through ASCII text.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000992
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    def _require_unicode_internal(self, wide_build=False):
        """Skip the running test when its preconditions are not met.

        The test is skipped if the "unicode_internal" codec cannot be looked
        up, or, when *wide_build* is true, if sys.maxunicode shows a narrow
        build (the original code silently passed in that case).
        """
        try:
            codecs.lookup("unicode_internal")
        except LookupError:
            self.skipTest("unicode_internal codec is not available")
        if wide_build and sys.maxunicode <= 0xffff:
            self.skipTest("requires a build with sys.maxunicode > 0xffff")

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        self._require_unicode_internal(wide_build=True)
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian; reverse them on little-endian
        # machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    def test_decode_error_attributes(self):
        """The raised UnicodeDecodeError describes the offending range."""
        self._require_unicode_internal(wide_build=True)
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as cm:
            data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    def test_decode_callback(self):
        """A registered error callback is used for undecodable units."""
        self._require_unicode_internal(wide_build=True)
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                "ascii"),
                          "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        self._require_unicode_internal()
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

    def test_escape_encode_length(self):
        """escape_encode() reports the number of input bytes consumed."""
        # Kept separate from test_encode_length so that it still runs when
        # the unicode_internal codec is unavailable.
        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001049
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 input, UTF-8 expected output).  An expected output of
# None means the input must be rejected; (None, None) marks a skipped case
# that is kept so list positions still match the section numbers.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #(b'\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # b'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1202
1203
class NameprepTest(unittest.TestCase):
    """Check encodings.idna.nameprep() against the nameprep_tests table."""

    def test_nameprep(self):
        """Run every non-skipped vector through nameprep().

        Vectors whose expected output is None must raise UnicodeError; any
        failure on the other vectors is re-raised as support.TestFailed
        labelled with the "3.N" section number of the vector.
        """
        from encodings.idna import nameprep
        # Start counting at 1 so that pos is the section number directly.
        for pos, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Chain the original exception so its traceback is kept.
                    raise support.TestFailed(
                        "Test 3.%d: %s" % (pos, str(e))) from e
Martin v. Löwis2548c732003-04-18 10:39:54 +00001222
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes names with and without a final dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes names with and without a final dot."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream returns an empty string."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Decoding byte by byte gives the same result as one-shot decoding."""
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        # Non-ASCII label without a trailing dot (this assertion used to be
        # an exact duplicate of the trailing-dot case below).
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is emitted by the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Encoding piecewise gives the same result as one-shot encoding."""
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        # Non-ASCII label without a trailing dot (this assertion used to be
        # an exact duplicate of the trailing-dot case below).
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is emitted by the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001298
class CodecsModuleTest(unittest.TestCase):
    """Exercise the module-level functions of the codecs module.

    Covers decode(), encode(), register(), lookup() and the get*() accessors,
    mostly by checking which exception is raised for missing or bogus
    arguments.
    """

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        # The data argument is mandatory; the encoding is optional.
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE: '\xffff' is U+00FF followed by the two letters "ff"; the
        # first character alone already cannot be encoded as ASCII.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching, so the previous locale is
        # put back even when the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1353
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 StreamReader wrapping a bytes stream."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two three-byte UTF-8 sequences separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        # The newline stays attached to the first line; the second line has
        # no terminator.
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001363
class EncodedFileTest(unittest.TestCase):
    """Check that codecs.EncodedFile recodes data in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller gets
        # UTF-16-LE bytes.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file receives
        # Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001375
# Names of the text encodings that the generic tests below iterate over.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build of the codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings excluded from the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001493
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic round-trip checks run against every name in
    all_unicode_encodings.

    Encodings listed in broken_unicode_with_streams and
    broken_incremental_coders are left out of the stream and incremental
    checks respectively.
    """

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        mismatch = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The registered codec name must match the name used for the
            # lookup, ignoring the "-" / "_" spelling difference.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            # Stateless round trip.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s),
                             mismatch % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, mismatch % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the reader one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s,
                                 mismatch % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     mismatch % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     mismatch % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s,
                                     mismatch % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(
                            encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(
                            decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(
                            decodedresult, s,
                            mismatch % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(
                            encoding, "ignore")
                        decodedresult = "".join(
                            cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(
                            decodedresult, s,
                            mismatch % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the non-bytes check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1626
class CharmapTest(unittest.TestCase):
    """Check codecs.charmap_decode() with the three kinds of mapping used
    here: a string, a dict of int -> str and a dict of int -> int.

    Each test covers the "strict", "replace" and "ignore" error handlers for
    a byte that is missing from the mapping or mapped to an undefined value.
    """

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Byte 2 lies beyond the end of the mapping string.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        # Byte 2 maps to U+FFFE, which is treated like a missing entry.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # One byte may map to several characters ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to none at all.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        # 0x110000 is one past the largest code point accepted above.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
1809
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001810
class WithStmtTest(unittest.TestCase):
    """Check that the codecs file wrappers can be used in a with statement."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001823
class TypesTest(unittest.TestCase):
    """Check which low-level decoders accept str input instead of bytes."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001853
Serhiy Storchakad6793772013-01-29 10:20:44 +02001854
class UnicodeEscapeTest(unittest.TestCase):
    """Check codecs.unicode_escape_encode() and unicode_escape_decode()."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII other than the backslash is encoded unchanged.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than the backslash decodes to the character with
        # the same ordinal.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # The remaining control characters and 127..255 use the \xNN form.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\8]", r"[\8]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # A backslash followed by any byte that does not start a known
        # escape is passed through with the backslash kept.
        for b in range(256):
            if b not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([b]), '\\' + chr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes, carrying i hex digits each.
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # A complete \U escape whose value is rejected.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
1931
1932
class SurrogateEscapeTest(unittest.TestCase):
    """Check the "surrogateescape" error handler with several codecs.

    Each undecodable byte 0xNN is expected to decode to the lone surrogate
    U+DCNN and to encode back to the original byte.
    """

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1965
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001966
class BomTest(unittest.TestCase):
    """Check the effect of seek() on files opened with codecs.open() in the
    UTF-16 and UTF-32 encodings."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every case below reuses the same scratch file; remove it at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002022
Victor Stinner3fed0872010-05-22 02:16:27 +00002023
# Names of the bytes-to-bytes transform codecs that TransformCodecTest
# iterates over.  The two compression codecs are only listed when the
# module they are named after can be imported.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2042
class TransformCodecTest(unittest.TestCase):
    # Round-trip checks for every codec named in bytes_transform_encodings.

    def test_basics(self):
        every_byte = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(every_byte)
            self.assertEqual(consumed, len(every_byte))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, every_byte)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(encoded))
            decoded = reader.read()
            self.assertEqual(decoded, b"\x80")

    def test_readline(self):
        # These two codecs are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                encoded = codecs.encode(b"\x80", encoding)
                make_reader = codecs.getreader(encoding)
                reader = make_reader(io.BytesIO(encoded))
                decoded = reader.readline()
                self.assertEqual(decoded, b"\x80")
2070
2071
def test_main():
    # Pass every test case class listed below, in this order, to
    # support.run_unittest().
    run = support.run_unittest
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        EscapeDecodeTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        UnicodeEscapeTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    run(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002103
2104
# Script entry point: run the suite only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_main()