# blob: 1fa9ee0f5e1fd88dd0d184cd916643ee01ccfad3 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    Simple FIFO buffer: data is appended at one end and consumed from the
    other.  The buffer keeps the type of the initial value passed to the
    constructor.
    """

    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the buffer."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the buffer.

        A negative *size* drains everything; otherwise at most *size* items
        are returned and the remainder stays buffered.
        """
        if size >= 0:
            head, self._buffer = self._buffer[:size], self._buffer[size:]
            return head
        drained = self._buffer
        # Slice instead of using a literal so the empty value has the same
        # type as the buffer.
        self._buffer = drained[:0]
        return drained
26
class MixInCheckStateHandling:
    """Mixin providing getstate()/setstate() round-trip checks.

    The host class must supply the unittest assertion methods used below.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every offset, carry the decoder state over to a fresh
        incremental decoder and check that both halves decode to *u*."""
        new_decoder = codecs.getincrementaldecoder(encoding)
        for cut in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:cut])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            resumed = new_decoder()
            resumed.setstate(state)
            tail = resumed.decode(s[cut:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every offset, carry the encoder state over to a fresh
        incremental encoder and check that both halves encode to *s*."""
        new_encoder = codecs.getincrementalencoder(encoding)
        for cut in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:cut])
            state = encoder.getstate()
            resumed = new_encoder()
            resumed.setstate(state)
            tail = resumed.encode(u[cut:], True)
            self.assertEqual(s, head + tail)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Stream-reading tests shared by the per-encoding test classes.

    Every helper and test here reads ``self.encoding``, which subclasses
    provide as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and check the output.

        After each byte, everything decoded so far must equal the matching
        entry of *partialresults*.  The check runs through a StreamReader,
        an incremental decoder, the same decoder after reset(), and finally
        codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        """Check StreamReader.readline() for every supported line ending."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: str.split() without a
        # separator treats all of them as whitespace and returns an empty
        # list, which would silently skip the loops below.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # At least 200 characters per line: an empty line read with
            # keepends=False would look like EOF to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader yields the input lines unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is still being written to."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Three CRLF-terminated lines are read back intact, then EOF."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Five CRLF-terminated lines are read back intact, then EOF."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    """ReadTest run against the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" preceded by a BOM, in little- and big-endian byte order
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter produce a single BOM."""
        codec = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = codec.streamwriter(sink)
        for _ in range(2):
            out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        source = codec.streamreader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        """Input starting with 0xff bytes only is rejected by the reader."""
        make_reader = codecs.getreader(self.encoding)
        for raw in (4*b"\xff", 8*b"\xff"):
            reader = make_reader(io.BytesIO(raw))
            with self.assertRaises(UnicodeError):
                reader.read()

    def test_partial(self):
        """Byte-wise decoding: BOM first, then one character per four bytes."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read
            "", # third byte of BOM read
            "", # fourth byte of BOM read => byteorder known
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        """The 'replace' and 'ignore' handlers on a truncated final input."""
        replaced = codecs.utf_32_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_32_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        """A truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
370
class UTF32LETest(ReadTest):
    """ReadTest run against the little-endian "utf-32-le" codec."""

    encoding = "utf-32-le"

    def test_partial(self):
        """Byte-wise decoding: one character completes every four bytes."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character encodes to its four bytes, low byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x00\x01\x00' * repeat
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
414
class UTF32BETest(ReadTest):
    """ReadTest run against the big-endian "utf-32-be" codec."""

    encoding = "utf-32-be"

    def test_partial(self):
        """Byte-wise decoding: one character completes every four bytes."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "",
            "",
            "\x00",
            "\x00",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_simple(self):
        """A non-BMP character encodes to its four bytes, high byte first."""
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        encoded = b'\x00\x01\x00\x00' * repeat
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
458
459
class UTF16Test(ReadTest):
    """ReadTest run against the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" preceded by a BOM, in little- and big-endian byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter produce a single BOM."""
        codec = codecs.lookup(self.encoding)
        # encode some stream
        sink = io.BytesIO()
        out = codec.streamwriter(sink)
        for _ in range(2):
            out.write("spam")
        data = sink.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        source = codec.streamreader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        """Input starting with 0xff bytes only is rejected by the reader."""
        make_reader = codecs.getreader(self.encoding)
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = make_reader(io.BytesIO(raw))
            with self.assertRaises(UnicodeError):
                reader.read()

    def test_partial(self):
        """Byte-wise decoding: BOM first, then two bytes per BMP character
        and four for the non-BMP one."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "", # first byte of BOM read
            "", # second byte of BOM read => byteorder known
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_handlers(self):
        """The 'replace' and 'ignore' handlers on a truncated final input."""
        replaced = codecs.utf_16_decode(b'\x01', 'replace', True)
        self.assertEqual(('\ufffd', 1), replaced)
        ignored = codecs.utf_16_decode(b'\x01', 'ignore', True)
        self.assertEqual(('', 1), ignored)

    def test_errors(self):
        """A truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        """Decoder state round-trips for both byte orders."""
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'
        payload = text.encode(self.encoding)

        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        # NOTE(review): the 'U' mode flag is kept on purpose; it appears to
        # be what this regression test exercises -- confirm before changing.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000539
class UTF16LETest(ReadTest):
    """ReadTest run against the little-endian "utf-16-le" codec."""

    encoding = "utf-16-le"

    def test_partial(self):
        """Byte-wise decoding: two bytes per BMP character and four for the
        non-BMP one."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and decodes to U+FFFD
        markers with the 'replace' handler."""
        # (malformed input, expected result with errors='replace')
        cases = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16le', 'replace'),
                             replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
582
class UTF16BETest(ReadTest):
    """ReadTest run against the big-endian "utf-16-be" codec."""

    encoding = "utf-16-be"

    def test_partial(self):
        """Byte-wise decoding: two bytes per BMP character and four for the
        non-BMP one."""
        text = "\x00\xff\u0100\uffff\U00010000"
        expected = [
            "",
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u0100",
            "\x00\xff\u0100",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff",
            "\x00\xff\u0100\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_errors(self):
        """Malformed input raises in strict mode and decodes to U+FFFD
        markers with the 'replace' handler."""
        # (malformed input, expected result with errors='replace')
        cases = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for malformed, replaced in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(malformed, 'strict', True)
            self.assertEqual(malformed.decode('utf-16be', 'replace'),
                             replaced)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
625
class UTF8Test(ReadTest):
    """ReadTest run against the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        """Byte-wise decoding of one- to four-byte sequences."""
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        expected = [
            "\x00",
            "\x00",
            "\x00\xff",
            "\x00\xff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff",
            "\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_decoder_state(self):
        """Decoder state round-trips at every split point of the sample."""
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         sample, sample.encode(self.encoding))

    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and passed to the
        selected error handler otherwise."""
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # (error handler, expected encoding of "[\uDC80]")
        handler_results = [
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ]
        for handler, encoded in handler_results:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), encoded)

    def test_surrogatepass_handler(self):
        """'surrogatepass' round-trips a lone surrogate but still rejects
        incomplete sequences."""
        text = "abc\ud800def"
        raw = b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
        self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000680
class UTF7Test(ReadTest):
    """ReadTest run against the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        """Byte-wise decoding of a text containing a literal plus sign."""
        text = "a+-b"
        expected = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000695
class UTF16ExTest(unittest.TestCase):
    """Error-path tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        """A truncated final input raises in strict mode."""
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        """Calling the decoder without arguments is a TypeError."""
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
703
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        """An array is returned as bytes together with its length."""
        import array
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        """An empty string yields empty bytes and a length of zero."""
        encoded = codecs.readbuffer_encode("")
        self.assertEqual(encoded, (b"", 0))

    def test_bad_args(self):
        """A missing argument or an int argument is a TypeError."""
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
719
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM)."""

    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The BOM written by the encoder must be swallowed by the
        # incremental decoder, leaving only the payload.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decoding(self, bytestring, unistring):
        """Assert that a utf-8-sig StreamReader turns *bytestring* into *unistring*.

        The stream is drained repeatedly, once per read size: a single
        unbounded read() (sizehint None) and then read(n) calls for a
        range of small and large values of n.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                # An empty result marks the end of the stream.
                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not appear in the output.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode like plain UTF-8.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decoding(bytestring, unistring)
803
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # Empty input yields empty bytes and a consumed length of zero.
        decoded = codecs.escape_decode("")
        self.assertEqual(decoded, (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000807
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000808class RecodingTest(unittest.TestCase):
809 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000810 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000811 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000812 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000813 f2.close()
814 # Python used to crash on this at exit because of a refcount
815 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000816
# From RFC 3492
# Each entry is a 2-tuple (unicode_text, punycode_bytes); PunycodeTest
# iterates over this list for both encoding and decoding.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example because of a missing comma between adjacent string literals).
# It only reports; it does not raise.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000924
class PunycodeTest(unittest.TestCase):
    """Run the RFC 3492 sample strings through the "punycode" codec."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(text.encode("punycode"), "ascii").lower()
            wanted = str(expected, "ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII str round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000943
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000944class UnicodeInternalTest(unittest.TestCase):
945 def test_bug1251300(self):
946 # Decoding with unicode_internal used to not correctly handle "code
947 # points" above 0x10ffff on UCS-4 builds.
948 if sys.maxunicode > 0xffff:
949 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000950 (b"\x00\x10\xff\xff", "\U0010ffff"),
951 (b"\x00\x00\x01\x01", "\U00000101"),
952 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000953 ]
954 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000955 b"\x7f\xff\xff\xff",
956 b"\x80\x00\x00\x00",
957 b"\x81\x00\x00\x00",
958 b"\x00",
959 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000960 ]
961 for internal, uni in ok:
962 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000963 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000964 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000965 for internal in not_ok:
966 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000967 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000968 self.assertRaises(UnicodeDecodeError, internal.decode,
969 "unicode_internal")
970
971 def test_decode_error_attributes(self):
972 if sys.maxunicode > 0xffff:
973 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000974 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000975 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000976 self.assertEqual("unicode_internal", ex.encoding)
977 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
978 self.assertEqual(4, ex.start)
979 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000980 else:
981 self.fail()
982
983 def test_decode_callback(self):
984 if sys.maxunicode > 0xffff:
985 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
986 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000987 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000988 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
989 "ascii"),
990 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000991 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000992
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000993 def test_encode_length(self):
994 # Issue 3739
995 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000996 self.assertEqual(encoder("a")[1], 1)
997 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000998
Ezio Melottib3aedd42010-11-20 19:04:17 +0000999 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001000
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both given as UTF-8 encoded bytes:
#   * (None, None) marks a test vector that is skipped;
#   * expected of None means nameprep must reject the input.
# The position in the list determines the "3.N" number reported on failure,
# so skipped vectors keep their slot.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the bytes \xef\xb9\xb6 below are the UTF-8 form of
    # U+FE76, not U+FB38 as the title says -- confirm against the draft.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1153
1154
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep()."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw_input, raw_expected) in enumerate(nameprep_tests):
            # Skipped
            if raw_input is None:
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw_input, "utf-8", "surrogatepass")
            if raw_expected is None:
                # Input contains prohibited characters
                with self.assertRaises(UnicodeError):
                    nameprep(text)
                continue
            expected = str(raw_expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as exc:
                # Re-raise with the number of the failing test vector.
                number = index + 1
                raise support.TestFailed("Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001173
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of an exhausted stream returns "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The four cases mirror
        # test_builtin_decode: ASCII and non-ASCII names, each with and
        # without a trailing dot (the non-ASCII case without a trailing
        # dot used to be missing; the trailing-dot one was checked twice).
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # Without a trailing dot the last label is held back until final=True.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four cases as test_builtin_encode, fed one character at a
        # time by iterencode (the non-ASCII case without a trailing dot
        # used to be missing; the trailing-dot one was checked twice).
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        # Without a trailing dot the last label is held back until final=True.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        # With a trailing dot the last label is emitted immediately.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001249
class CodecsModuleTest(unittest.TestCase):
    """Argument and lookup checks for the module-level codecs functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding given: the default codec is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding given: the default codec is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # Note: '\xffff' is the three characters "\xff", "f", "f".
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bogus_name in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bogus_name)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Registered before switching so the locale is always restored.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1304
class StreamReaderTest(unittest.TestCase):
    # readlines() on a UTF-8 StreamReader wrapped around an in-memory stream.

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        expected_lines = ['\ud55c\n', '\uae00']
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, expected_lines)
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001314
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile recodes between the encoding seen by the caller and
    # the encoding of the underlying file, in both directions.

    def test_basic(self):
        # Reading: the backing file holds UTF-8, the caller gets UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        recoded = recoder.read()
        self.assertEqual(recoded, b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the backing file gets Latin-1.
        latin1_sink = io.BytesIO()
        recoder = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin1')
        recoder.write(b'\xc3\xbc')
        stored = latin1_sink.getvalue()
        self.assertEqual(stored, b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001326
# Names of the text codecs that BasicUnicodeTest iterates over.  They are
# kept as whitespace-separated words so that related codecs share a line;
# str.split() turns them into the same flat list, in the same order.
all_unicode_encodings = """
    ascii big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp720 cp737 cp775
    cp850 cp852 cp855 cp856 cp857 cp858
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk
    hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004
    iso2022_jp_3 iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape
    shift_jis shift_jis_2004 shift_jisx0213
    tis_620
    unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Everything above, plus "idna", is excluded from the incremental
# encoder/decoder checks.
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001444
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Generic checks that are run against every codec listed in
    # all_unicode_encodings.  test_basics() delegates its three round-trip
    # variants to the private _check_*() helpers below.

    @staticmethod
    def _mismatch(got, expected, encoding):
        # Failure message shared by the round-trip assertions; it always
        # names the codec so a failure inside the loop can be attributed.
        return "%r != %r (encoding=%r)" % (got, expected, encoding)

    @staticmethod
    def _incremental_roundtrip(encoder, make_decoder, s, final):
        # Encode s one character at a time with encoder, then decode the
        # bytes one at a time with a decoder obtained from make_decoder().
        # When final is true, both coders additionally receive an empty
        # chunk with final=True.  Returns the decoded string.
        encoded = b"".join(encoder.encode(c) for c in s)
        if final:
            encoded += encoder.encode("", True)
        decoder = make_decoder()
        decoded = "".join(decoder.decode(bytes([c])) for c in encoded)
        if final:
            decoded += decoder.decode(b"", True)
        return decoded

    def _check_stream_roundtrip(self, encoding, s):
        # check stream reader/writer: write s one character at a time, then
        # read it back one byte at a time.
        q = Queue(b"")
        writer = codecs.getwriter(encoding)(q)
        encodedresult = b""
        for c in s:
            writer.write(c)
            chunk = q.read()
            self.assertIs(type(chunk), bytes, "encoding=%r" % encoding)
            encodedresult += chunk
        q = Queue(b"")
        reader = codecs.getreader(encoding)(q)
        decodedresult = ""
        for c in encodedresult:
            q.write(bytes([c]))
            decodedresult += reader.read()
        self.assertEqual(decodedresult, s,
                         self._mismatch(decodedresult, s, encoding))

    def _check_incremental_roundtrip(self, encoding, s):
        # check incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # Python API
        decodedresult = self._incremental_roundtrip(
            encoder, lambda: codecs.getincrementaldecoder(encoding)(),
            s, True)
        self.assertEqual(decodedresult, s,
                         self._mismatch(decodedresult, s, encoding))

        # C API
        decodedresult = self._incremental_roundtrip(
            cencoder, lambda: _testcapi.codec_incrementaldecoder(encoding),
            s, True)
        self.assertEqual(decodedresult, s,
                         self._mismatch(decodedresult, s, encoding))

        # check iterencode()/iterdecode()
        result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding),
                                           encoding))
        self.assertEqual(result, s, self._mismatch(result, s, encoding))

        # check iterencode()/iterdecode() with empty string
        result = "".join(codecs.iterdecode(codecs.iterencode("", encoding),
                                           encoding))
        self.assertEqual(result, "", self._mismatch(result, "", encoding))

    def _check_incremental_with_errors(self, encoding, s):
        # check incremental decoder/encoder with errors argument.  No
        # final=True call is made here.
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        decodedresult = self._incremental_roundtrip(
            encoder, lambda: codecs.getincrementaldecoder(encoding)("ignore"),
            s, False)
        self.assertEqual(decodedresult, s,
                         self._mismatch(decodedresult, s, encoding))

        decodedresult = self._incremental_roundtrip(
            cencoder,
            lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
            s, False)
        self.assertEqual(decodedresult, s,
                         self._mismatch(decodedresult, s, encoding))

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the name used for lookup
            # once "_" and "-" are treated alike.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            # Stateless round trip through getencoder()/getdecoder().
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s),
                             self._mismatch(size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, self._mismatch(chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_roundtrip(encoding, s)
                if encoding not in ("idna", "mbcs"):
                    self._check_incremental_with_errors(encoding, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state
                # and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data, "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # Calling without input must fail for every codec; a non-bytes
            # argument is checked for all but idna and punycode.
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1577
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() with three kinds of mapping: a string indexed
    # by byte value, a dict of int -> str and a dict of int -> int.  Every
    # test decodes the same three bytes and varies the mapping and the
    # error handler.

    def test_decode_with_string_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        self.assertEqual(decode(raw, "strict", "abc"), ("abc", 3))

        # Tables in which byte 2 is past the end or maps to U+FFFE.
        incomplete_tables = ("ab", "ab\ufffe")

        for table in incomplete_tables:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", table)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for table in incomplete_tables:
                self.assertEqual(decode(raw, errors, table), (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )

    def test_decode_with_int2str_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode

        # Mappings that cover all three bytes; a value may be several
        # characters long, a non-BMP character, or the empty string.
        complete_maps = [
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        ]
        for mapping, expected in complete_maps:
            self.assertEqual(decode(raw, "strict", mapping), (expected, 3))

        # Byte 2 is absent, mapped to None, or mapped to U+FFFE
        # (the last case is Issue #14850).
        incomplete_maps = [
            {0: 'a', 1: 'b'},
            {0: 'a', 1: 'b', 2: None},
            {0: 'a', 1: 'b', 2: '\ufffe'},
        ]

        for mapping in incomplete_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for mapping in incomplete_maps:
                self.assertEqual(decode(raw, errors, mapping), (expected, 3))

        every_byte = bytes(range(256))
        self.assertEqual(
            decode(every_byte, "ignore", {}),
            ("", len(every_byte))
        )

    def test_decode_with_int2int_map(self):
        raw = b"\x00\x01\x02"
        decode = codecs.charmap_decode
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(
            decode(raw, "strict", {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            decode(raw, "strict", {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        # A code point above 0x10FFFF is rejected with TypeError.
        with self.assertRaises(TypeError):
            decode(raw, "strict", {0: 0x110000, 1: b, 2: c})

        # Byte 2 is absent or mapped to 0xFFFE.
        incomplete_maps = [
            {0: a, 1: b},
            {0: a, 1: b, 2: 0xFFFE},
        ]

        for mapping in incomplete_maps:
            with self.assertRaises(UnicodeDecodeError):
                decode(raw, "strict", mapping)

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            for mapping in incomplete_maps:
                self.assertEqual(decode(raw, errors, mapping), (expected, 3))
1760
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02001761
class WithStmtTest(unittest.TestCase):
    # The file-like wrappers in codecs can be used as context managers.

    def test_encodedfile(self):
        utf8_bytes = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(utf8_bytes, "latin-1", "utf-8") as recoder:
            recoded = recoder.read()
            self.assertEqual(recoded, b"\xfc")

    def test_streamreaderwriter(self):
        utf8_bytes = io.BytesIO(b"\xc3\xbc")
        codec = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            utf8_bytes, codec.streamreader, codec.streamwriter, 'strict')
        with wrapper as srw:
            text = srw.read()
            self.assertEqual(text, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001774
class TypesTest(unittest.TestCase):
    # Which argument types the low-level *_decode functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        bytes_only_decoders = (
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        )
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            bytes_only_decoders += (codecs.mbcs_decode,)
        for decode in bytes_only_decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001804
class SurrogateEscapeTest(unittest.TestCase):
    # Behaviour of the "surrogateescape" error handler with several codecs.

    def _check_both_ways(self, encoding, raw, text):
        # raw must decode to text, and text must encode back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_both_ways("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_both_ways("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_both_ways("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1837
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001838
class BomTest(unittest.TestCase):
    # Files opened with codecs.open() must read back exactly what was
    # written, whatever combination of write() and seek() produced them.

    def test_seek0(self):
        payload = "1234567890"
        doubled = payload * 2
        path = support.TESTFN
        encodings = [
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        ]
        self.addCleanup(support.unlink, path)
        for encoding in encodings:

            def reopen():
                # Start each scenario from a freshly truncated file.
                return codecs.open(path, 'w+', encoding=encoding)

            # Check if the BOM is written only once
            with reopen() as handle:
                handle.write(payload)
                handle.write(payload)
                handle.seek(0)
                self.assertEqual(handle.read(), doubled)
                handle.seek(0)
                self.assertEqual(handle.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen() as handle:
                handle.write(payload[0])
                self.assertNotEqual(handle.tell(), 0)
                handle.seek(0)
                handle.write(payload)
                handle.seek(0)
                self.assertEqual(handle.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as handle:
                writer = handle.writer
                writer.write(payload[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(payload)
                handle.seek(0)
                self.assertEqual(handle.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen() as handle:
                handle.write(payload)
                handle.seek(handle.tell())
                handle.write(payload)
                handle.seek(0)
                self.assertEqual(handle.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen() as handle:
                writer = handle.writer
                writer.write(payload)
                writer.seek(writer.tell())
                writer.write(payload)
                handle.seek(0)
                self.assertEqual(handle.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001894
Victor Stinner3fed0872010-05-22 02:16:27 +00001895
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The first four are
# always listed; the two compression codecs are added only when the module
# they depend on can be imported.
bytes_transform_encodings = "base64_codec uu_codec quopri_codec hex_codec".split()

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1914
class TransformCodecTest(unittest.TestCase):
    # Round trips through the bytes-to-bytes codecs named in
    # bytes_transform_encodings.

    def test_basics(self):
        every_byte = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            decode = codecs.getdecoder(encoding)
            encoded, consumed = encode(every_byte)
            self.assertEqual(consumed, len(every_byte))
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, every_byte)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = io.BytesIO(codecs.encode(b"\x80", encoding))
            stream_reader = codecs.getreader(encoding)(encoded)
            self.assertEqual(stream_reader.read(), b"\x80")

    def test_readline(self):
        # uu_codec and zlib_codec are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding not in skipped:
                encoded = io.BytesIO(codecs.encode(b"\x80", encoding))
                stream_reader = codecs.getreader(encoding)(encoded)
                self.assertEqual(stream_reader.readline(), b"\x80")
1942
1943
def test_main():
    # Hand every test case class of this module to the regrtest helper,
    # in the order the classes are listed here.
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest,
        EscapeDecodeTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


# Run the whole suite when the file is executed as a script.
if __name__ == "__main__":
    test_main()