blob: 749f415ba6b2cbf42bffa6079e938b4723e6a7e6 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Build an assertion helper bound to a test case and a codec function.

    The returned ``check(input, expect)`` callable runs ``coder`` on
    ``input`` and asserts, through ``self.assertEqual``, that the result is
    the pair ``(expect, len(input))``.
    """
    def check(input, expect):
        # Run the codec first, then build the (output, consumed) pair that
        # it is expected to have returned.
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)
    return check
11
class Queue(object):
    """
    FIFO buffer: data is written at one end and read back from the other end
    """
    def __init__(self, buffer):
        # ``buffer`` is the initial content; later writes are concatenated
        # onto it and reads slice it, so it fixes the type being queued.
        self._buffer = buffer

    def write(self, chars):
        """Append ``chars`` to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the buffer.

        A negative ``size`` drains everything that is buffered; otherwise at
        most ``size`` items are returned and the rest stays queued.
        """
        if size >= 0:
            head, tail = self._buffer[:size], self._buffer[size:]
            self._buffer = tail
            return head
        drained = self._buffer
        self._buffer = drained[:0]  # empty value of the same type
        return drained
31
class MixInCheckStateHandling:
    """Mixin with helpers that exercise incremental codec state handling.

    The class using this mixin must supply the unittest assertion methods
    called below (assertEqual, assertIsInstance, assertFalse).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check decoder getstate()/setstate() at every split point of ``s``.

        encoding -- codec name handed to codecs.getincrementaldecoder()
        u        -- the text that ``s`` is expected to decode to
        s        -- the encoded input

        For each prefix length, the state of a decoder that consumed the
        prefix is transplanted into a fresh decoder, which then decodes the
        remainder; both parts together must equal ``u``.
        """
        for i in range(len(s)+1):
            decoder = codecs.getincrementaldecoder(encoding)()
            part1 = decoder.decode(s[:i])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            part2 = decoder.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check encoder getstate()/setstate() at every split point of ``u``.

        encoding -- codec name handed to codecs.getincrementalencoder()
        u        -- the text to encode
        s        -- the bytes that ``u`` is expected to encode to
        """
        for i in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            # Continue in a brand-new encoder that only knows the saved state
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
64
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Shared StreamReader / incremental decoder tests.

    Every method reads ``self.encoding``, which this class does not define;
    concrete subclasses are expected to set it to a codec name.
    """

    def check_partial(self, input, partialresults):
        """Decode ``input`` byte by byte and compare the growing result.

        ``input`` is encoded with ``self.encoding``; after feeding each
        encoded byte, the text decoded so far must equal the matching entry
        of ``partialresults``.  The check is run with a StreamReader, with an
        incremental decoder (before and after reset()), and finally the
        whole input is round-tripped through codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """Exercise StreamReader.readline() with every kind of line end."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: they are all whitespace, so
        # building this list with str.split() would give an empty list and
        # the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 keeps the first line non-empty; an empty line read with
            # keepends=False would look like end of input to readalllines()
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield the original lines unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is still being written to."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three consecutive lines must be returned intact by readline()."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five consecutive lines must be returned intact by readline()."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000284
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM, error and state tests for "utf-32"."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little- and big-endian order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit exactly one BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must raise UnicodeError."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            (
                [""] * 4    # the four BOM bytes; byteorder known after the last
                + [""] * 3
                + ["\x00"] * 4
                + ["\x00\xff"] * 4
                + ["\x00\xff\u0100"] * 4
                + ["\x00\xff\u0100\uffff"] * 4
                + ["\x00\xff\u0100\uffff\U00010000"]
            )
        )

    def test_handlers(self):
        """'replace' and 'ignore' handlers on a truncated final code unit."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final code unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state save/restore for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
375
class UTF32LETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-32-le"."""
    encoding = "utf-32-le"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character is encoded as four little-endian bytes."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated final code unit raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
419
class UTF32BETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-32-be"."""
    encoding = "utf-32-be"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""] * 3
            + ["\x00"] * 4
            + ["\x00\xff"] * 4
            + ["\x00\xff\u0100"] * 4
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character is encoded as four big-endian bytes."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated final code unit raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
463
464
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM, error and state tests for "utf-16"."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little- and big-endian order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must emit exactly one BOM."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the actual bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input that does not start with a valid BOM must raise UnicodeError."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            (
                [""] * 2    # the two BOM bytes; byteorder known after the second
                + [""]
                + ["\x00"] * 2
                + ["\x00\xff"] * 2
                + ["\x00\xff\u0100"] * 2
                + ["\x00\xff\u0100\uffff"] * 4
                + ["\x00\xff\u0100\uffff\U00010000"]
            )
        )

    def test_handlers(self):
        """'replace' and 'ignore' handlers on a truncated final code unit."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final code unit raises in strict mode."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state save/restore for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the old universal-newlines 'U' flag,
        # which newer Python versions reject as an invalid mode; the check
        # that "\r\n" survives unchanged is the same either way.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000544
class UTF16LETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced otherwise."""
        # (malformed bytes, text expected with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16le', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
587
class UTF16BETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = (
            [""]
            + ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u0100"] * 2
            + ["\x00\xff\u0100\uffff"] * 4
            + ["\x00\xff\u0100\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and is replaced otherwise."""
        # (malformed bytes, text expected with the 'replace' handler)
        cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16be', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
630
class UTF8Test(ReadTest):
    """ReadTest checks plus surrogate-handling tests for "utf-8"."""
    encoding = "utf-8"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = (
            ["\x00"] * 2
            + ["\x00\xff"] * 2
            + ["\x00\xff\u07ff"] * 3
            + ["\x00\xff\u07ff\u0800"] * 3
            + ["\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        """Decoder state save/restore across 1- to 4-byte sequences."""
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        """Lone surrogates are rejected unless an error handler steps in."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # (error handler, bytes expected for "[\uDC80]")
        handled = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, expected in handled:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        """'surrogatepass' round-trips a surrogate but rejects broken bytes."""
        text = "abc\ud800def"
        raw = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
        self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # a truncated sequence, then one with an invalid continuation byte
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000685
class UTF7Test(ReadTest):
    """ReadTest checks for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        """Byte-by-byte decoding: one expected result per encoded byte."""
        text = "a+-b"
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000700
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """A truncated final code unit raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling without any argument raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
708
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array of signed bytes is returned as bytes plus its length."""
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        """An empty string gives empty bytes and a length of zero."""
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument raises TypeError."""
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
724
class UTF8SigTest(ReadTest):
    """Run the shared ReadTest checks plus BOM handling for "utf-8-sig"."""

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        """Decode *bytestring* through a "utf-8-sig" StreamReader.

        The stream is read to exhaustion once per size hint (None means a
        plain read() with no argument); the concatenated chunks must equal
        *unistring* every time.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                # An empty result marks the end of the stream.
                if not data:
                    break
                ostream.write(data)

            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starts with a BOM, which must not show up in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Same payload without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
808
809class EscapeDecodeTest(unittest.TestCase):
810 def test_empty(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200811 self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000812
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200813 def test_raw(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200814 decode = codecs.escape_decode
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200815 for b in range(256):
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200816 b = bytes([b])
817 if b != b'\\':
818 self.assertEqual(decode(b + b'0'), (b + b'0', 2))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200819
820 def test_escape(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200821 decode = codecs.escape_decode
822 check = coding_checker(self, decode)
823 check(b"[\\\n]", b"[]")
824 check(br'[\"]', b'["]')
825 check(br"[\']", b"[']")
826 check(br"[\\]", br"[\]")
827 check(br"[\a]", b"[\x07]")
828 check(br"[\b]", b"[\x08]")
829 check(br"[\t]", b"[\x09]")
830 check(br"[\n]", b"[\x0a]")
831 check(br"[\v]", b"[\x0b]")
832 check(br"[\f]", b"[\x0c]")
833 check(br"[\r]", b"[\x0d]")
834 check(br"[\7]", b"[\x07]")
835 check(br"[\8]", br"[\8]")
836 check(br"[\78]", b"[\x078]")
837 check(br"[\41]", b"[!]")
838 check(br"[\418]", b"[!8]")
839 check(br"[\101]", b"[A]")
840 check(br"[\1010]", b"[A0]")
841 check(br"[\501]", b"[A]")
842 check(br"[\x41]", b"[A]")
843 check(br"[\X41]", br"[\X41]")
844 check(br"[\x410]", b"[A0]")
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200845 for b in range(256):
846 if b not in b'\n"\'\\abtnvfr01234567x':
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200847 b = bytes([b])
848 check(b'\\' + b, b'\\' + b)
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200849
850 def test_errors(self):
Serhiy Storchaka077cb342013-01-29 11:06:53 +0200851 decode = codecs.escape_decode
852 self.assertRaises(ValueError, decode, br"\x")
853 self.assertRaises(ValueError, decode, br"[\x]")
854 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
855 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
856 self.assertRaises(ValueError, decode, br"\x0")
857 self.assertRaises(ValueError, decode, br"[\x0]")
858 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
859 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchakaace3ad32013-01-25 23:31:43 +0200860
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000861class RecodingTest(unittest.TestCase):
862 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000863 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000864 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000865 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000866 f2.close()
867 # Python used to crash on this at exit because of a refcount
868 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000869
# Sample strings from RFC 3492, as (unicode text, punycode bytes) pairs.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: report any entry that is not a 2-tuple, which
# would typically mean a comma went missing in the table above.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000977
class PunycodeTest(unittest.TestCase):
    """Check the "punycode" codec against the punycode_testcases table."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii").lower()
            wanted = str(puny, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode once more from a bytes object rebuilt via ASCII text.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000996
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000997class UnicodeInternalTest(unittest.TestCase):
998 def test_bug1251300(self):
999 # Decoding with unicode_internal used to not correctly handle "code
1000 # points" above 0x10ffff on UCS-4 builds.
1001 if sys.maxunicode > 0xffff:
1002 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +00001003 (b"\x00\x10\xff\xff", "\U0010ffff"),
1004 (b"\x00\x00\x01\x01", "\U00000101"),
1005 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001006 ]
1007 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +00001008 b"\x7f\xff\xff\xff",
1009 b"\x80\x00\x00\x00",
1010 b"\x81\x00\x00\x00",
1011 b"\x00",
1012 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001013 ]
1014 for internal, uni in ok:
1015 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +00001016 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001017 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001018 for internal in not_ok:
1019 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +00001020 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001021 self.assertRaises(UnicodeDecodeError, internal.decode,
1022 "unicode_internal")
1023
1024 def test_decode_error_attributes(self):
1025 if sys.maxunicode > 0xffff:
1026 try:
Walter Dörwald092a2252007-06-07 11:26:16 +00001027 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +00001028 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +00001029 self.assertEqual("unicode_internal", ex.encoding)
1030 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1031 self.assertEqual(4, ex.start)
1032 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001033 else:
1034 self.fail()
1035
1036 def test_decode_callback(self):
1037 if sys.maxunicode > 0xffff:
1038 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1039 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +00001040 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +00001041 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1042 "ascii"),
1043 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001044 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001045
Walter Dörwald8dc33d52009-05-06 14:41:26 +00001046 def test_encode_length(self):
1047 # Issue 3739
1048 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +00001049 self.assertEqual(encoder("a")[1], 1)
1050 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +00001051
Ezio Melottib3aedd42010-11-20 19:04:17 +00001052 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001053
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected), both UTF-8 encoded.  An expected value of
# None means nameprep must reject the input; (None, None) marks a vector
# that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE', b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f', b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0', b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba', b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa', b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7', b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0', b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90', b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0', b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96', b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96', b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ', b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0', b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80', None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80', b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b', b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80', b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f', b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85', None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e', None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf', b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5', None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3', None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4', None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4', None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe', None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf', None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82', None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd', None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5', None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81', b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e', None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa', None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81', None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82', None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar', None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar', None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar', b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71', None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8', b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1206
1207
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the vector number (1-based, section 3.x).
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001226
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via the one-shot, stream and incremental
    interfaces."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of an exhausted stream yields "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Same four names as test_builtin_decode, fed one byte at a time.
        # The third case used to repeat the trailing-dot form, so the
        # form without a trailing dot was never decoded incrementally.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            single_bytes = (bytes([c]) for c in encoded)
            self.assertEqual(
                "".join(codecs.iterdecode(single_bytes, "idna")),
                expected
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is held back until the
        # final call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is emitted as soon as the dot
        # arrives, leaving nothing for the final call.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as test_builtin_encode; the third case used to
        # repeat the trailing-dot form.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        # Without a trailing dot the last label is held back until the
        # final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        # With a trailing dot everything is emitted before the final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001302
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of the codecs module."""

    def test_decode(self):
        """codecs.decode() decodes bytes and rejects bad calls."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # Called without an explicit encoding argument.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode() encodes text and rejects bad calls."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # Called without an explicit encoding argument.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() requires exactly one callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() fails cleanly for missing or unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        previous = locale.setlocale(locale.LC_CTYPE)
        # Restore the caller's LC_CTYPE whatever happens below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, previous)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1357
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # UTF-8 bytes for U+D55C, a newline, then U+AE00.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits on the newline and keeps it on the first line."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001367
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the wrapped file holds UTF-8, the caller gets UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the wrapped file gets Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001379
# Text codecs exercised by the generic tests that follow.  The order of the
# entries is preserved from the original one-per-line list; entries are only
# grouped by family for readability.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13",
    "iso8859_14", "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only tested when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001497
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to every codec in ``all_unicode_encodings``."""

    @staticmethod
    def _mismatch(actual, expected, encoding):
        """Return the failure message shared by the round-trip assertions."""
        return "%r != %r (encoding=%r)" % (actual, expected, encoding)

    def _check_stream_roundtrip(self, encoding, text):
        """Write *text* char by char, then read it back byte by byte."""
        queue = Queue(b"")
        writer = codecs.getwriter(encoding)(queue)
        encoded = b""
        for char in text:
            writer.write(char)
            chunk = queue.read()
            self.assertTrue(type(chunk) is bytes, type(chunk))
            encoded += chunk

        queue = Queue(b"")
        reader = codecs.getreader(encoding)(queue)
        decoded = ""
        for byte in encoded:
            queue.write(bytes([byte]))
            decoded += reader.read()
        self.assertEqual(decoded, text,
                         self._mismatch(decoded, text, encoding))

    def _check_incremental_roundtrip(self, encoding, text):
        """Round-trip *text* through the incremental coders and iter*code()."""
        # check incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # Decoders are created lazily so each one is only fetched after the
        # matching encoder has produced its output.
        coder_pairs = (
            # Python API
            (encoder, lambda: codecs.getincrementaldecoder(encoding)()),
            # C API
            (cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding)),
        )
        for enc, make_decoder in coder_pairs:
            encoded = b""
            for char in text:
                encoded += enc.encode(char)
            encoded += enc.encode("", True)
            dec = make_decoder()
            decoded = ""
            for byte in encoded:
                decoded += dec.decode(bytes([byte]))
            decoded += dec.decode(b"", True)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

        # check iterencode()/iterdecode()
        encoded_iter = codecs.iterencode(text, encoding)
        result = "".join(codecs.iterdecode(encoded_iter, encoding))
        self.assertEqual(result, text, self._mismatch(result, text, encoding))

        # check iterencode()/iterdecode() with empty string
        encoded_iter = codecs.iterencode("", encoding)
        result = "".join(codecs.iterdecode(encoded_iter, encoding))
        self.assertEqual(result, "")

    def _check_incremental_roundtrip_with_errors(self, encoding, text):
        """Same as the incremental round trip, passing errors="ignore"."""
        # check incremental decoder/encoder with errors argument
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        coder_pairs = (
            (encoder,
             lambda: codecs.getincrementaldecoder(encoding)("ignore")),
            (cencoder,
             lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore")),
        )
        for enc, make_decoder in coder_pairs:
            encoded = b"".join(enc.encode(char) for char in text)
            dec = make_decoder()
            decoded = "".join(dec.decode(bytes([byte])) for byte in encoded)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

    def test_basics(self):
        """Round-trip a short ASCII string through each codec interface."""
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry once
            # "_" and "-" are treated as equivalent.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            # Stateless encoder/decoder functions.
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             self._mismatch(consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text,
                             self._mismatch(decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                self._check_stream_roundtrip(encoding, text)

            if encoding not in broken_incremental_coders:
                self._check_incremental_roundtrip(encoding, text)

            if encoding not in ("idna", "mbcs"):
                self._check_incremental_roundtrip_with_errors(encoding, text)

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading everything."""
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped: FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) an int."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with self.assertRaises(TypeError):
                encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text,
                                             text.encode(encoding))
            self.check_state_handling_encode(encoding, text,
                                             text.encode(encoding))
1630
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with str, int->str and int->int mappings.

    Every test decodes the three bytes 0, 1, 2.  The third byte is the one
    whose mapping is missing or undefined in the error-handling cases.
    """

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))

        # Byte 2 is either past the end of the map or mapped to U+FFFE.
        undefined_maps = ("ab", "ab\ufffe")

        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for charmap in undefined_maps:
                self.assertEqual(decode(data, errors, charmap),
                                 (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        # Fully defined maps: single chars, multi-char strings, a non-BMP
        # char, and an empty replacement string.
        defined_maps = [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]
        for charmap, expected in defined_maps:
            self.assertEqual(decode(data, "strict", charmap), (expected, 3))

        # Byte 2 is missing, mapped to None, or mapped to U+FFFE
        # (the last one is Issue #14850).
        undefined_maps = [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ]

        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for charmap in undefined_maps:
                self.assertEqual(decode(data, errors, charmap),
                                 (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        # 0x110000 is one past the highest code point.
        self.assertRaises(TypeError,
                          decode, data, "strict", {0: 0x110000, 1: b, 2: c})

        # Byte 2 is missing or mapped to 0xFFFE.
        undefined_maps = [
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ]

        for charmap in undefined_maps:
            self.assertRaises(UnicodeDecodeError,
                              decode, data, "strict", charmap)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for charmap in undefined_maps:
                self.assertEqual(decode(data, errors, charmap),
                                 (expected, 3))
1813
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001814
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001827
class TypesTest(unittest.TestCase):
    """Which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        codec_names = (
            "utf_7", "utf_8",
            "utf_16_le", "utf_16_be", "utf_16_ex",
            "utf_32", "utf_32_le", "utf_32_be", "utf_32_ex",
            "latin_1", "ascii", "charmap",
        )
        decoders = [getattr(codecs, name + "_decode") for name in codec_names]
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001857
Serhiy Storchakad6793772013-01-29 10:20:44 +02001858
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode_escape encoder and decoder functions."""

    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        """Printable ASCII other than the backslash is encoded unchanged."""
        encode = codecs.unicode_escape_encode
        backslash = ord('\\')
        for code in range(32, 127):
            if code == backslash:
                continue
            self.assertEqual(encode(chr(code)), (bytes([code]), 1))

    def test_raw_decode(self):
        """Any byte other than the backslash decodes to the same code point."""
        decode = codecs.unicode_escape_decode
        backslash = ord('\\')
        for code in range(256):
            if code == backslash:
                continue
            self.assertEqual(decode(bytes([code]) + b'0'),
                             (chr(code) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode

        def check(text, expected):
            # The encoder must also report the whole input as consumed.
            self.assertEqual(encode(text), (expected, len(text)))

        for text, expected in (
            ('\t', br'\t'),
            ('\n', br'\n'),
            ('\r', br'\r'),
            ('\\', br'\\'),
        ):
            check(text, expected)
        # Remaining control characters and 127..255 use the \xNN form.
        for code in range(32):
            if chr(code) not in '\t\n\r':
                check(chr(code), ('\\x%02x' % code).encode())
        for code in range(127, 256):
            check(chr(code), ('\\x%02x' % code).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode

        def check(data, expected):
            # The decoder must also report the whole input as consumed.
            self.assertEqual(decode(data), (expected, len(data)))

        cases = [
            (b"[\\\n]", "[]"),
            (br'[\"]', '["]'),
            (br"[\']", "[']"),
            (br"[\\]", r"[\]"),
            (br"[\a]", "[\x07]"),
            (br"[\b]", "[\x08]"),
            (br"[\t]", "[\x09]"),
            (br"[\n]", "[\x0a]"),
            (br"[\v]", "[\x0b]"),
            (br"[\f]", "[\x0c]"),
            (br"[\r]", "[\x0d]"),
            (br"[\7]", "[\x07]"),
            (br"[\8]", r"[\8]"),
            (br"[\78]", "[\x078]"),
            (br"[\41]", "[!]"),
            (br"[\418]", "[!8]"),
            (br"[\101]", "[A]"),
            (br"[\1010]", "[A0]"),
            (br"[\x41]", "[A]"),
            (br"[\x410]", "[A0]"),
            (br"\u20ac", "\u20ac"),
            (br"\U0001d120", "\U0001d120"),
        ]
        for data, expected in cases:
            check(data, expected)
        # A backslash followed by any other byte is passed through as is.
        for code in range(256):
            if code not in b'\n"\'\\abtnvfr01234567xuUN':
                check(b'\\' + bytes([code]), '\\' + chr(code))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        for letter, digits in (b'x', 2), (b'u', 4), (b'U', 4):
            for count in range(digits):
                # An escape with fewer hex digits than it needs, alone and
                # wrapped in brackets.
                truncated = b"\\" + letter + b"0"*count
                bracketed = b"[" + truncated + b"]"
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode, bracketed)
                data = bracketed + truncated
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # 0x110000 is one past the highest code point.
        too_big = br"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), ("", 10))
        self.assertEqual(decode(too_big, "replace"), ("\ufffd", 10))
1935
1936
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips of undecodable bytes via the surrogateescape handler."""

    def test_utf8(self):
        cases = [
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        ]
        for raw, text in cases:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1969
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001970
class BomTest(unittest.TestCase):
    """BOM handling of codecs.open() files around seek()."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        path = support.TESTFN
        # Remove the scratch file once the test is over.
        self.addCleanup(support.unlink, path)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(path, 'w+', encoding=encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002026
Victor Stinner3fed0872010-05-22 02:16:27 +00002027
# Names of the bytes-to-bytes transform codecs that TransformCodecTest
# iterates over.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

# The compression codecs are only added when their backing module can be
# imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
2046
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for every codec in bytes_transform_encodings."""

    def test_basics(self):
        """Encode then decode all 256 byte values via getencoder/getdecoder."""
        all_bytes = bytes(range(256))
        for name in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(name)(all_bytes)
            self.assertEqual(consumed, len(all_bytes))
            decoded, consumed = codecs.getdecoder(name)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, all_bytes)

    def test_read(self):
        """A StreamReader.read() over encoded input returns the original byte."""
        for name in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", name)
            stream_reader = codecs.getreader(name)(io.BytesIO(encoded))
            self.assertEqual(stream_reader.read(), b"\x80")

    def test_readline(self):
        """A StreamReader.readline() over encoded input returns the original byte."""
        # NOTE(review): these two codecs are excluded here without a stated
        # reason in this file -- confirm why before re-enabling them.
        skipped = ('uu_codec', 'zlib_codec')
        for name in bytes_transform_encodings:
            if name not in skipped:
                encoded = codecs.encode(b"\x80", name)
                stream_reader = codecs.getreader(name)(io.BytesIO(encoded))
                self.assertEqual(stream_reader.readline(), b"\x80")
2074
2075
def test_main():
    """Hand the test case classes listed below to support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        EscapeDecodeTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        UnicodeEscapeTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002107
2108
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_main()