blob: 2d1b2495938400ee371beadc8ffb4bcfe8670d50 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    FIFO buffer: data written at one end is read back from the other end.

    The buffer keeps the sequence type it was created with (for example
    ``bytes``), so the "empty" value returned by read() has that type too.
    """
    def __init__(self, buffer):
        # The initial content of the queue.
        self._buffer = buffer

    def write(self, chars):
        # Append the new data at the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A non-negative size consumes at most that many items from the
        # head of the queue; a negative size drains the queue completely.
        if size >= 0:
            head = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return head
        drained = self._buffer
        self._buffer = self._buffer[:0]  # empty value of the same type
        return drained
26
class MixInCheckStateHandling:
    """
    Mix-in with helpers that check getstate()/setstate() of incremental
    codecs.  It relies on the assert* methods of the class it is mixed
    into.
    """
    def check_state_handling_decode(self, encoding, u, s):
        # Cut the encoded input s at every possible position, move the
        # decoder state over to a brand new decoder and check that the
        # two decoded pieces together give u.
        for cut in range(len(s)+1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:cut])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # put the decoder into the default state with an empty
                # buffer
                decoder.setstate((state[0][:0], 0))
                # feeding the buffered input again must not produce output
                self.assertFalse(decoder.decode(state[0]))
                # and the decoder must be back in the state we started from
                self.assertEqual(state, decoder.getstate())
            # Continue with a new decoder that is set to the state
            # extracted from the old one
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            tail = decoder.decode(s[cut:], True)
            self.assertEqual(u, head+tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: cut the text u at every position and
        # finish the job with a new encoder that got the old state.
        for cut in range(len(u)+1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:cut])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            tail = encoder.encode(u[cut:], True)
            self.assertEqual(s, head+tail)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """
    Tests shared by all codecs that are read through a StreamReader or an
    incremental decoder.

    Subclasses set the class attribute ``encoding`` to the name of the
    codec under test.
    """
    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to the codec one byte at a time
        # and check after every byte that everything decoded so far equals
        # the corresponding entry of partialresults.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring
        # version of input to the reader byte by byte. Read everything
        # available from the StreamReader and check that the results equal
        # the appropriate entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder: once
        # freshly created and once more after reset() to check whether the
        # reset method works properly
        d = codecs.getincrementaldecoder(self.encoding)()
        for after_reset in (False, True):
            if after_reset:
                d.reset()
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += d.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(d.decode(b"", True), "")
            self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line until an empty line is returned and
            # return the lines joined with "|"
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and loops over it silently test nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line is non-empty: an empty line read with
            # keepends=False would make readalllines() stop early.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # the "xxx" line between two test lines
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must return the lines unchanged
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # The writer and the reader share one queue, so the reader only
        # sees what has been written so far
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec, which writes and detects a BOM."""
    encoding = "utf-32"

    # "spamspam" with a single leading BOM, little and big endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec = codecs.lookup(self.encoding)
        # write the text in two pieces through one StreamWriter
        raw = io.BytesIO()
        out = codec.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # the second write() must not have added another BOM
        self.assertIn(data, (self.spamle, self.spambe))
        # reading the data back must return the text
        back = codec.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # Neither 4 nor 8 bytes of 0xff start with a valid BOM
        for garbage in (4*b"\xff", 8*b"\xff"):
            f = codecs.getreader(self.encoding)(io.BytesIO(garbage))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read
            "", # third byte of BOM read
            "", # fourth byte of BOM read => byteorder known
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        # A single byte is truncated input; check both error handlers
        replaced = codecs.utf_32_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
370
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # one entry for every encoded byte
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(data)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
414
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # one entry for every encoded byte
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(data)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
458
459
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec, which writes and detects a BOM."""
    encoding = "utf-16"

    # "spamspam" with a single leading BOM, little and big endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Neither 2 nor 4 bytes of 0xff start with a valid BOM
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A single byte is truncated input; check both error handlers
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Use plain 'r' instead of the deprecated 'U' mode, which newer
        # versions of open() reject; codecs.open() reads in binary mode
        # either way, so the '\r\n' line ends are still returned unchanged.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000539
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # one entry for every encoded byte
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (invalid input, result of decoding it with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16le', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # A non-BMP character is stored as a surrogate pair
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
582
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # one entry for every encoded byte
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        # (invalid input, result of decoding it with errors='replace')
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            replaced = raw.decode('utf-16be', 'replace')
            self.assertEqual(replaced, expected)

    def test_nonbmp(self):
        # A non-BMP character is stored as a surrogate pair
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual("\U00010203".encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), "\U00010203")
625
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # one entry for every encoded byte
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in both directions by default ...
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # ... and are passed to the error handler when encoding
        handled = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for errors, expected in handled:
            self.assertEqual("[\uDC80]".encode("utf-8", errors), expected)

    def test_surrogatepass_handler(self):
        encoded = "abc\ud800def".encode("utf-8", "surrogatepass")
        self.assertEqual(encoded, b"abc\xed\xa0\x80def")
        decoded = b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass")
        self.assertEqual(decoded, "abc\ud800def")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # An incomplete surrogate sequence is still an error
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000680
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        # one entry for every encoded byte
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000695
class UTF16ExTest(unittest.TestCase):
    """Tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte is truncated input when final is true
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The data argument is required
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
703
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # Any object with the buffer interface is accepted
        data = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(data), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # missing argument
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        # argument without buffer interface
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
719
class UTF8SigTest(ReadTest):
    """ReadTest-based checks for the "utf-8-sig" codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding with utf-8-sig and decoding incrementally must give back
        # the original text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Shared driver for the stream tests: read *bytestring* through a
        # utf-8-sig StreamReader once per size hint (None means an
        # unbounded read()) and check that the concatenated chunks equal
        # *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                # An empty chunk signals that the stream is exhausted.
                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a UTF-8 BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Same payload without a BOM; the decoded text must be identical.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
803
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        decoded = codecs.escape_decode("")
        self.assertEqual(decoded, (b"", 0))

    def test_raw(self):
        # Every byte value except the backslash itself comes back unchanged.
        backslash = b'\\'[0]
        for value in range(256):
            if value == backslash:
                continue
            raw = bytes([value]) + b'0'
            self.assertEqual(codecs.escape_decode(raw), (raw, 2))

    def test_escape(self):
        # Pairs of (escaped input, expected (decoded bytes, length) result).
        cases = [
            (b"[\\\n]", (b"[]", 4)),
            (br'[\"]', (b'["]', 4)),
            (br"[\']", (b"[']", 4)),
            (br"[\\]", (br"[\]", 4)),
            (br"[\a]", (b"[\x07]", 4)),
            (br"[\b]", (b"[\x08]", 4)),
            (br"[\t]", (b"[\x09]", 4)),
            (br"[\n]", (b"[\x0a]", 4)),
            (br"[\v]", (b"[\x0b]", 4)),
            (br"[\f]", (b"[\x0c]", 4)),
            (br"[\r]", (b"[\x0d]", 4)),
            (br"[\7]", (b"[\x07]", 4)),
            (br"[\8]", (br"[\8]", 4)),
            (br"[\78]", (b"[\x078]", 5)),
            (br"[\41]", (b"[!]", 5)),
            (br"[\418]", (b"[!8]", 6)),
            (br"[\101]", (b"[A]", 6)),
            (br"[\1010]", (b"[A0]", 7)),
            (br"[\501]", (b"[A]", 6)),
            (br"[\x41]", (b"[A]", 6)),
            (br"[\X41]", (br"[\X41]", 6)),
            (br"[\x410]", (b"[A0]", 7)),
        ]
        for escaped, expected in cases:
            self.assertEqual(codecs.escape_decode(escaped), expected)
        # A backslash followed by any byte outside this set is expected to
        # be returned verbatim.
        recognised = b'\n"\'\\abtnvfr01234567x'
        for value in range(256):
            if value in recognised:
                continue
            literal = b'\\' + bytes([value])
            self.assertEqual(codecs.escape_decode(literal), (literal, 2))

    def test_errors(self):
        # For each truncated \x escape: (bare, bracketed, bracketed + bare,
        # length expected in the result for the last form).
        cases = [
            (br"\x", br"[\x]", br"[\x]\x", 6),
            (br"\x0", br"[\x0]", br"[\x0]\x0", 8),
        ]
        for bare, bracketed, doubled, length in cases:
            self.assertRaises(ValueError, codecs.escape_decode, bare)
            self.assertRaises(ValueError, codecs.escape_decode, bracketed)
            self.assertEqual(codecs.escape_decode(doubled, "ignore"),
                             (b"[]", length))
            self.assertEqual(codecs.escape_decode(doubled, "replace"),
                             (b"[?]?", length))
851
class RecodingTest(unittest.TestCase):
    """Regression check for writing through a codecs.EncodedFile wrapper."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so there is nothing to assert here:
        # the write/close sequence just has to complete.
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000860
# From RFC 3492
# Each entry is a 2-tuple: (unicode string, expected punycode as bytes).
# PunycodeTest compares encodings case-insensitively, so the upper-case
# letters that appear in some expected values are not significant there.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print any entry that is not a
# 2-tuple (for instance because of a missing comma between adjacent strings).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000968
class PunycodeTest(unittest.TestCase):
    """Runs the "punycode" codec over every pair in punycode_testcases."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(text.encode("punycode"), "ascii")
            wanted = str(expected, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode a second time from a bytes object rebuilt through an
            # ASCII decode/encode round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000987
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the "unicode_internal" codec.

    The decoding tests feed four-byte code units and therefore only apply
    when sys.maxunicode > 0xffff; elsewhere they are reported as skipped
    instead of silently passing.
    """

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The literals above are written big-endian; reverse them on
        # little-endian machines before decoding.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_decode_error_attributes(self):
        # The second four-byte unit is invalid; the exception must describe
        # exactly that slice of the input.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as cm:
            data.decode("unicode_internal")
        ex = cm.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires sys.maxunicode > 0xffff")
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Insert four 0x22 bytes between the two encoded characters; with
        # the "ignore" callback they are expected to be dropped from the
        # output while still counting towards the 12 consumed bytes.
        ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                "ascii"),
                          "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # The second item returned by an encoder is the input length.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder("a")[1], 1)
        self.assertEqual(encoder("\xe9\u0142")[1], 2)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001044
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both given as UTF-8 encoded bytes.
# An expected value of None means nameprep must reject the input; an entry
# of (None, None) is a skipped vector that is kept as a placeholder so that
# list position N-1 still corresponds to test "3.N" in NameprepTest's
# failure messages.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1197
1198
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep() over every vector in nameprep_tests."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # Vectors are numbered 3.1, 3.2, ... by list position; every
            # failure message carries this label so the offending vector
            # can be identified.
            label = "Test 3.%d" % (pos+1)
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                try:
                    nameprep(orig)
                except UnicodeError:
                    continue
                self.fail("%s: UnicodeError not raised" % label)
            prepped = str(prepped, "utf-8", "surrogatepass")
            try:
                result = nameprep(orig)
            except UnicodeError as e:
                # Valid input was rejected: report it as a labelled test
                # failure rather than an anonymous error.
                self.fail("%s: %s" % (label, e))
            self.assertEqual(result, prepped, label)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001217
class IDNACodecTest(unittest.TestCase):
    """Checks for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the whole input has been read, a further read() returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The four cases mirror
        # test_builtin_decode: ASCII and IDN names, each with and without a
        # trailing dot.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label comes out on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Iterating over a str feeds the encoder one character at a time.
        # The four cases mirror test_builtin_encode.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # As for decoding, the last label is held back until the final call
        # unless a trailing dot has already terminated it.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001293
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of the module-level codecs API."""

    def test_decode(self):
        latin1_bytes = b'\xe4\xf6\xfc'
        self.assertEqual(codecs.decode(latin1_bytes, 'latin-1'),
                         '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # With no encoding argument the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        latin1_text = '\xe4\xf6\xfc'
        self.assertEqual(codecs.encode(latin1_text, 'latin-1'),
                         b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # With no encoding argument the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bogus_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bogus_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased to the dotless i (U+0131).
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching so the locale is restored on every exit
        # path, including the skip below.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        self.assertEqual(codecs.lookup('ASCII').name, 'ascii')
1348
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapped around an in-memory stream."""

    def setUp(self):
        # Two three-byte UTF-8 sequences separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        # The line ending stays attached; the last line has none.
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001358
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller gets UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file gets Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001370
# Names of the text encodings exercised by BasicUnicodeTest, kept in the
# original order.  Written as one whitespace-separated block; split() yields
# a list with one codec name per entry.
all_unicode_encodings = """
    ascii big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp720 cp737 cp775
    cp850 cp852 cp855 cp856 cp857 cp858
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk
    hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3
    iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape
    shift_jis shift_jis_2004 shift_jisx0213
    tis_620 unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Skipped by the incremental-coder checks: everything above, plus idna.
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001488
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every name in all_unicode_encodings."""

    def _assert_same_text(self, actual, expected, encoding):
        """Compare decoded text, naming the encoding in the failure message."""
        self.assertEqual(actual, expected,
                         "%r != %r (encoding=%r)" % (actual, expected, encoding))

    def _check_stream_roundtrip(self, encoding, text):
        """Push text through a StreamWriter char by char, then read it back
        through a StreamReader one byte at a time."""
        queue = Queue(b"")
        writer = codecs.getwriter(encoding)(queue)
        pieces = []
        for char in text:
            writer.write(char)
            piece = queue.read()
            self.assertTrue(type(piece) is bytes, type(piece))
            pieces.append(piece)
        encoded = b"".join(pieces)

        queue = Queue(b"")
        reader = codecs.getreader(encoding)(queue)
        decoded = ""
        for byte in encoded:
            queue.write(bytes([byte]))
            decoded += reader.read()
        self._assert_same_text(decoded, text, encoding)

    def _incremental_roundtrip(self, encoder, make_decoder, text, finalize):
        """Encode text char by char, decode the result byte by byte.

        make_decoder is called only after encoding has finished.  With
        finalize true, both coders get a closing call with final=True.
        """
        encoded = b"".join(encoder.encode(char) for char in text)
        if finalize:
            encoded += encoder.encode("", True)
        decoder = make_decoder()
        decoded = "".join(decoder.decode(bytes([byte])) for byte in encoded)
        if finalize:
            decoded += decoder.decode(b"", True)
        return decoded

    def _check_incremental(self, encoding, text):
        """Incremental coders (fetched via the Python and C API) and
        iterencode()/iterdecode()."""
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        decoded = self._incremental_roundtrip(
            encoder, lambda: codecs.getincrementaldecoder(encoding)(),
            text, True)
        self._assert_same_text(decoded, text, encoding)

        # check C API
        decoded = self._incremental_roundtrip(
            cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding),
            text, True)
        self._assert_same_text(decoded, text, encoding)

        # check iterencode()/iterdecode()
        chunks = codecs.iterencode(text, encoding)
        result = "".join(codecs.iterdecode(chunks, encoding))
        self._assert_same_text(result, text, encoding)

        # check iterencode()/iterdecode() with empty string
        chunks = codecs.iterencode("", encoding)
        result = "".join(codecs.iterdecode(chunks, encoding))
        self.assertEqual(result, "")

    def _check_incremental_with_errors(self, encoding, text):
        """Incremental coders built with an explicit errors argument; no
        final=True call is made in this variant."""
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        decoded = self._incremental_roundtrip(
            encoder, lambda: codecs.getincrementaldecoder(encoding)("ignore"),
            text, False)
        self._assert_same_text(decoded, text, encoding)

        decoded = self._incremental_roundtrip(
            cencoder,
            lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
            text, False)
        self._assert_same_text(decoded, text, encoding)

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The list entry must match the registered codec name, ignoring
            # "-" versus "_"; "_codec" names and latin_1 are special-cased.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless round trip.
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             "%r != %r (encoding=%r)" % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self._assert_same_text(decoded, text, encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                self._check_stream_roundtrip(encoding, text)

            if encoding not in broken_incremental_coders:
                self._check_incremental(encoding, text)
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    self._check_incremental_with_errors(encoding, text)

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # FIXME: idna is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            # idna and punycode are exempt from the wrong-type check.
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with self.assertRaises(TypeError):
                encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison is trivially true; evaluating it is the whole test.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text, text.encode(encoding))
            self.check_state_handling_encode(encoding, text, text.encode(encoding))
1621
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with str, int->str and int->int mappings."""

    # Unless a check says otherwise, these three bytes are what gets decoded.
    _PAYLOAD = b"\x00\x01\x02"

    def _decode(self, errors, mapping):
        """Decode the standard payload with the given handler and mapping."""
        return codecs.charmap_decode(self._PAYLOAD, errors, mapping)

    def _check_undefined_last_byte(self, mappings):
        """For mappings with no usable entry for byte 2: "strict" must raise,
        "replace" must yield U+FFFD and "ignore" must drop the byte, with all
        three input bytes reported as consumed."""
        for mapping in mappings:
            with self.assertRaises(UnicodeDecodeError):
                self._decode("strict", mapping)
            self.assertEqual(self._decode("replace", mapping), ("ab\ufffd", 3))
            self.assertEqual(self._decode("ignore", mapping), ("ab", 3))

    def test_decode_with_string_map(self):
        self.assertEqual(self._decode("strict", "abc"), ("abc", 3))

        # Either the map is too short, or it maps byte 2 to U+FFFE.
        self._check_undefined_last_byte(["ab", "ab\ufffe"])

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        fully_mapped = [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]
        for mapping, expected in fully_mapped:
            self.assertEqual(self._decode("strict", mapping), (expected, 3))

        # Missing key, None, or (Issue #14850) a '\ufffe' value for byte 2.
        self._check_undefined_last_byte([
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ])

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(self._decode("strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(self._decode("strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        # One past the last code point is a TypeError, not a decode error.
        with self.assertRaises(TypeError):
            self._decode("strict", {0: 0x110000, 1: b, 2: c})

        # Missing key, or 0xFFFE as the value for byte 2.
        self._check_undefined_last_byte([
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ])
1803 )
1804
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001805
class WithStmtTest(unittest.TestCase):
    """The codecs stream wrappers must be usable as context managers."""

    # U+00FC encoded as UTF-8; shared input for both checks.
    _UTF8_U_UMLAUT = b"\xc3\xbc"

    def test_encodedfile(self):
        raw = io.BytesIO(self._UTF8_U_UMLAUT)
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoder:
            self.assertEqual(recoder.read(), b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(self._UTF8_U_UMLAUT)
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as stream:
            self.assertEqual(stream.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001818
class TypesTest(unittest.TestCase):
    """Which low-level decoder functions accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        strict_about_bytes = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when this build of codecs exposes it.
        if hasattr(codecs, "mbcs_decode"):
            strict_about_bytes.append(codecs.mbcs_decode)
        for decode in strict_about_bytes:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), ("\u1234", 6))
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001848
class SurrogateEscapeTest(unittest.TestCase):
    """The "surrogateescape" error handler across several codecs."""

    def _check_both_ways(self, raw, escaped, encoding):
        """Decoding raw must give escaped, and encoding escaped must give raw."""
        self.assertEqual(raw.decode(encoding, "surrogateescape"), escaped)
        self.assertEqual(escaped.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_both_ways(b"foo\x80bar", "foo\udc80bar", "utf-8")
        # bad-utf-8 encoded surrogate
        self._check_both_ways(b"\xed\xb0\x80", "\udced\udcb0\udc80", "utf-8")

    def test_ascii(self):
        # bad byte
        self._check_both_ways(b"foo\x80bar", "foo\udc80bar", "ascii")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways(b"foo\xa5bar", "foo\udca5bar", "iso-8859-3")

    def test_latin1(self):
        # Issue6373
        # Only the encoding direction is checked here.
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1881
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001882
class BomTest(unittest.TestCase):
    """Byte-order-mark handling when seeking in files opened by codecs.open()."""

    def _reopen(self, encoding):
        """Open the scratch file in 'w+' mode with the given encoding."""
        return codecs.open(support.TESTFN, 'w+', encoding=encoding)

    def test_seek0(self):
        data = "1234567890"
        twice = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with self._reopen(encoding) as stream:
                stream.write(data)
                stream.write(data)
                # Read back twice, rewinding before each read.
                for _ in range(2):
                    stream.seek(0)
                    self.assertEqual(stream.read(), twice)

            # Check that the BOM is written after a seek(0)
            with self._reopen(encoding) as stream:
                stream.write(data[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with self._reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with self._reopen(encoding) as stream:
                stream.write(data)
                stream.seek(stream.tell())
                stream.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with self._reopen(encoding) as stream:
                writer = stream.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                stream.seek(0)
                self.assertEqual(stream.read(), twice)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001938
Victor Stinner3fed0872010-05-22 02:16:27 +00001939
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The first four are
# always listed; the compression codecs are added only when their module can
# be imported.
bytes_transform_encodings = ["base64_codec", "uu_codec", "quopri_codec", "hex_codec"]

try:
    import zlib
    bytes_transform_encodings.append("zlib_codec")
except ImportError:
    pass

try:
    import bz2
    bytes_transform_encodings.append("bz2_codec")
except ImportError:
    pass
1958
class TransformCodecTest(unittest.TestCase):
    """Round trips through the codecs listed in bytes_transform_encodings."""

    def _reader_over_encoded(self, encoding, payload):
        """Encode payload with the codec and wrap the result in its StreamReader."""
        encoded = codecs.encode(payload, encoding)
        return codecs.getreader(encoding)(io.BytesIO(encoded))

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            reader = self._reader_over_encoded(encoding, b"\x80")
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # uu_codec and zlib_codec are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            reader = self._reader_over_encoded(encoding, b"\x80")
            self.assertEqual(reader.readline(), b"\x80")
1986
1987
def test_main():
    """Run every test case class of this module through support.run_unittest()."""
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest,
        EscapeDecodeTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()