blob: acf9f64b6b932aaefeca315cd82933f44f1ce97c [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Barry Warsaw04f357c2002-07-23 19:04:11 +00002import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Walter Dörwaldc3ab0a72007-05-10 15:02:49 +00005import sys, _testcapi, io
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer fixes the element type (bytes or str) that
        # write() must be given from then on.
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything; otherwise at most ``size``
        # items are removed from the head of the buffer.
        pending = self._buffer
        if size < 0:
            head, tail = pending, pending[:0]  # pending[:0] is an empty buffer of the same type
        else:
            head, tail = pending[:size], pending[size:]
        self._buffer = tail
        return head
26
class MixInCheckStateHandling:
    # Helpers for TestCase subclasses: split the data at every possible
    # position, carry the codec state over to a brand-new incremental codec
    # and verify that the two halves still add up to the expected result.

    def check_state_handling_decode(self, encoding, u, s):
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            decoder = make_decoder()
            head = decoder.decode(s[:split])
            state = decoder.getstate()
            flags = state[1]
            self.assertIsInstance(flags, int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not flags:
                pending = state[0]
                # put the decoder back into its default state with an
                # empty input buffer
                decoder.setstate((pending[:0], 0))
                # Re-feeding the buffered input must not produce output ...
                self.assertFalse(decoder.decode(pending))
                # ... and must bring the decoder back to the same state
                self.assertEqual(state, decoder.getstate())
            # Continue with a fresh decoder primed with the saved state
            decoder = make_decoder()
            decoder.setstate(state)
            tail = decoder.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            encoder = make_encoder()
            head = encoder.encode(u[:split])
            state = encoder.getstate()
            # Continue with a fresh encoder primed with the saved state
            encoder = make_encoder()
            encoder.setstate(state)
            tail = encoder.encode(u[split:], True)
            self.assertEqual(s, head + tail)
59
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Reading tests shared by the per-encoding test classes.  Every method
    # reads ``self.encoding``, which concrete subclasses provide.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty string and join the
            # lines with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line terminators are spelled out explicitly: every one of them
        # is whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # The "+200" keeps the first line non-empty; an empty line would
            # end readalllines() early when the line ends are stripped.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield the input line by line
        # with the "\r\n" line ends kept intact.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so data written after a
        # readline() call becomes visible to the next one.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000279
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = (b'\xff\xfe\x00\x00' +
              2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff' +
              2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write the text in two pieces through a stream writer
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for piece in ("spam", "spam"):
            out.write(piece)
        data = raw.getvalue()
        # exactly one BOM must have been emitted
        self.assertIn(data, (self.spamle, self.spambe))
        # and the data must read back unchanged
        source = info.streamreader(io.BytesIO(data))
        self.assertEqual(source.read(), "spamspam")

    def test_badbom(self):
        # One and two code units of 0xff bytes are both invalid.
        for count in (4, 8):
            f = codecs.getreader(self.encoding)(io.BytesIO(count*b"\xff"))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # repeats[k] consecutive bytes leave exactly text[:k] decoded: the
        # four BOM bytes plus the first three bytes of a code unit yield
        # nothing, then each further complete unit adds one character.
        repeats = (7, 4, 4, 4, 4)
        expected = []
        for done, count in enumerate(repeats):
            expected.extend([text[:done]] * count)
        expected.append(text)  # the final byte completes the last character
        self.check_partial(text, expected)

    def test_handlers(self):
        for handler, output in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((output, 1),
                             codecs.utf_32_decode(b'\x01', handler, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
370
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # repeats[k] consecutive bytes leave exactly text[:k] decoded: a
        # character only appears once all four bytes of its unit are in.
        repeats = (3, 4, 4, 4, 4)
        expected = []
        for done, count in enumerate(repeats):
            expected.extend([text[:done]] * count)
        expected.append(text)  # the final byte completes the last character
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        units = 1024
        decoded = codecs.utf_32_le_decode(b'\x00\x00\x01\x00' * units)[0]
        self.assertEqual('\U00010000' * units, decoded)
414
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # repeats[k] consecutive bytes leave exactly text[:k] decoded: a
        # character only appears once all four bytes of its unit are in.
        repeats = (3, 4, 4, 4, 4)
        expected = []
        for done, count in enumerate(repeats):
            expected.extend([text[:done]] * count)
        expected.append(text)  # the final byte completes the last character
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        units = 1024
        decoded = codecs.utf_32_be_decode(b'\x00\x01\x00\x00' * units)[0]
        self.assertEqual('\U00010000' * units, decoded)
458
459
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, little- and big-endian.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of 'U': universal-newlines mode was
        # deprecated in Python 3.4 and open() rejects it since 3.11.  Either
        # way codecs.open() opens the underlying file in binary mode, which
        # is exactly what this test checks.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000539
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # repeats[k] consecutive bytes leave exactly text[:k] decoded.  BMP
        # characters take two bytes each; the trailing surrogate pair takes
        # four, so text[:4] is seen once on completion plus three more times.
        repeats = (1, 2, 2, 2, 4)
        expected = []
        for done, count in enumerate(repeats):
            expected.extend([text[:done]] * count)
        expected.append(text)  # the final byte completes the last character
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        char, encoded = "\U00010203", b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), char)
582
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff\U00010000"
        # repeats[k] consecutive bytes leave exactly text[:k] decoded.  BMP
        # characters take two bytes each; the trailing surrogate pair takes
        # four, so text[:4] is seen once on completion plus three more times.
        repeats = (1, 2, 2, 2, 4)
        expected = []
        for done, count in enumerate(repeats):
            expected.extend([text[:done]] * count)
        expected.append(text)  # the final byte completes the last character
        self.check_partial(text, expected)

    def test_errors(self):
        # (malformed input, result of decoding with the 'replace' handler)
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        char, encoded = "\U00010203", b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), encoded)
        self.assertEqual(encoded.decode(self.encoding), char)
625
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff\U00010000"
        # repeats[k-1] consecutive bytes leave exactly text[:k] decoded: the
        # byte completing a character plus every leading byte of the next
        # (longer and longer) sequence.
        repeats = (2, 2, 3, 3, 4, 1)
        expected = []
        for done, count in enumerate(repeats, 1):
            expected.extend([text[:done]] * count)
        self.check_partial(text, expected)

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = u.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, u, encoded)

    def test_lone_surrogates(self):
        # strict mode rejects lone surrogates in both directions
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # each error handler has its own way of dealing with them
        for handler, expected in (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        ):
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        text, encoded = "abc\ud800def", b"abc\xed\xa0\x80def"
        self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
        self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # truncated or malformed surrogate sequences are still errors
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000680
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        text = "a+-b"
        # one expected intermediate result per encoded byte
        expected = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000695
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # a single byte cannot be a complete UTF-16 code unit
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # the input argument is mandatory
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
703
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # any object exporting a buffer is accepted, not just bytes
        payload = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(payload), (b"spam", 4))

    def test_empty(self):
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # a missing argument and a non-buffer argument are both rejected
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
719
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec."""

    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        # One expected value per step of partial input (21 in total);
        # check_partial is inherited from ReadTest, defined outside this
        # part of the file.
        expected = (
            # Nothing is produced at first: the first BOM has been read
            # and skipped by the third step.
            [""] * 5
            # Second BOM has been read and emitted
            + ["\ufeff"]
            # "\x00" read and emitted, then first byte of encoded "\xff" read
            + ["\ufeff\x00"] * 2
            # Second byte of encoded "\xff" read, then first byte of
            # encoded "\u07ff" read
            + ["\ufeff\x00\xff"] * 2
            # Second byte of encoded "\u07ff" read
            + ["\ufeff\x00\xff\u07ff"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800"] * 3
            + ["\ufeff\x00\xff\u07ff\u0800\uffff"] * 4
            + ["\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"]
        )
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        s = "spam"
        d = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader using a range
        of read sizes and check that each run decodes to *unistring*."""
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                # An empty result marks the end of the stream.
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM.
        self._check_stream(codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
                           "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Same text without a leading BOM.
        self._check_stream(b"ABC\xC2\xA1\xE2\x88\x80XYZ",
                           "ABC\u00A1\u2200XYZ")
803
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # Empty input yields empty bytes and a consumed length of zero.
        decoded = codecs.escape_decode("")
        self.assertEqual(decoded, (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000807
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000808class RecodingTest(unittest.TestCase):
809 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000810 f = io.BytesIO()
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000811 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000812 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000813 f2.close()
814 # Python used to crash on this at exit because of a refcount
815 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000816
# From RFC 3492
# Each entry is a pair: (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a 2-tuple.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000924
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over the module-level punycode_testcases."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again from a bytes object rebuilt via an ASCII
            # round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000943
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000944class UnicodeInternalTest(unittest.TestCase):
945 def test_bug1251300(self):
946 # Decoding with unicode_internal used to not correctly handle "code
947 # points" above 0x10ffff on UCS-4 builds.
948 if sys.maxunicode > 0xffff:
949 ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000950 (b"\x00\x10\xff\xff", "\U0010ffff"),
951 (b"\x00\x00\x01\x01", "\U00000101"),
952 (b"", ""),
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000953 ]
954 not_ok = [
Walter Dörwald092a2252007-06-07 11:26:16 +0000955 b"\x7f\xff\xff\xff",
956 b"\x80\x00\x00\x00",
957 b"\x81\x00\x00\x00",
958 b"\x00",
959 b"\x00\x00\x00\x00\x00",
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000960 ]
961 for internal, uni in ok:
962 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000963 internal = bytes(reversed(internal))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000964 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000965 for internal in not_ok:
966 if sys.byteorder == "little":
Walter Dörwald092a2252007-06-07 11:26:16 +0000967 internal = bytes(reversed(internal))
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000968 self.assertRaises(UnicodeDecodeError, internal.decode,
969 "unicode_internal")
970
971 def test_decode_error_attributes(self):
972 if sys.maxunicode > 0xffff:
973 try:
Walter Dörwald092a2252007-06-07 11:26:16 +0000974 b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
Guido van Rossumb940e112007-01-10 16:19:56 +0000975 except UnicodeDecodeError as ex:
Ezio Melottib3aedd42010-11-20 19:04:17 +0000976 self.assertEqual("unicode_internal", ex.encoding)
977 self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
978 self.assertEqual(4, ex.start)
979 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000980 else:
981 self.fail()
982
983 def test_decode_callback(self):
984 if sys.maxunicode > 0xffff:
985 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
986 decoder = codecs.getdecoder("unicode_internal")
Guido van Rossum98297ee2007-11-06 21:34:58 +0000987 ab = "ab".encode("unicode_internal").decode()
Guido van Rossum3172c5d2007-10-16 18:12:55 +0000988 ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
989 "ascii"),
990 "UnicodeInternalTest")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000991 self.assertEqual(("ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000992
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000993 def test_encode_length(self):
994 # Issue 3739
995 encoder = codecs.getencoder("unicode_internal")
Ezio Melottib3aedd42010-11-20 19:04:17 +0000996 self.assertEqual(encoder("a")[1], 1)
997 self.assertEqual(encoder("\xe9\u0142")[1], 2)
Walter Dörwald8dc33d52009-05-06 14:41:26 +0000998
Ezio Melottib3aedd42010-11-20 19:04:17 +0000999 self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001000
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (UTF-8 input, UTF-8 expected output).  An expected output of
# None marks input containing prohibited characters; (None, None) marks a
# skipped test vector.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1153
1154
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the module-level nameprep_tests."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
                continue
            prepped = str(prepped, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                # Re-raise with the number of the failing test vector.
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001173
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        # Everything was consumed by the first read.
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  Same four names as the
        # one-shot test: plain and ACE ("xn--") forms, each with and
        # without a trailing dot.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for encoded, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                expected
            )

        # A label is only emitted once the dot that ends it has been seen;
        # the last label is held back until the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot the last label is complete immediately.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Iterating a str feeds the encoder one character at a time.
        # Same four names as the one-shot test.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # The unterminated last label is only flushed by the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001249
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of codecs."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # The encoding argument may be omitted.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # The encoding argument may be omitted.
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        previous = locale.setlocale(locale.LC_CTYPE)
        # Restore the original locale however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, previous)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1304
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 stream reader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001314
class EncodedFileTest(unittest.TestCase):
    """Check the recoding done by codecs.EncodedFile on read and on write."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE bytes.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 bytes handed to write() land in the file as Latin-1.
        sink = io.BytesIO()
        recoded = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001326
# Codec names iterated over by the BasicUnicodeTest methods.  The names are
# whitespace-separated; their order here is the iteration order.
all_unicode_encodings = """
    ascii big5 big5hkscs charmap
    cp037 cp1006 cp1026 cp1140
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
    cp424 cp437 cp500 cp720 cp737 cp775
    cp850 cp852 cp855 cp856 cp857 cp858
    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869
    cp874 cp875 cp932 cp949 cp950
    euc_jis_2004 euc_jisx0213 euc_jp euc_kr
    gb18030 gb2312 gbk hp_roman8 hz idna
    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3
    iso2022_jp_ext iso2022_kr
    iso8859_1 iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15
    iso8859_16 iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6
    iso8859_7 iso8859_8 iso8859_9
    johab koi8_r koi8_u latin_1
    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish
    palmos ptcp154 punycode raw_unicode_escape
    shift_jis shift_jis_2004 shift_jisx0213 tis_620
    unicode_escape unicode_internal
    utf_16 utf_16_be utf_16_le utf_7 utf_8
""".split()

# "mbcs" is only added when this build of codecs exposes mbcs_encode.
if getattr(codecs, "mbcs_encode", None) is not None:
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Superset of the list above: "idna" is additionally excluded from the
# incremental coder checks.
broken_incremental_coders = list(broken_unicode_with_streams)
broken_incremental_coders.append("idna")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001444
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks applied to each entry of all_unicode_encodings."""

    def test_basics(self):
        text = "abc123" # all codecs should be able to encode these

        def mismatch(actual, expected):
            # Message attached to the round-trip assertions; it reads the
            # current loop value of ``encoding``.
            return "%r != %r (encoding=%r)" % (actual, expected, encoding)

        def incremental_roundtrip(encoder, make_decoder, final):
            # Encode ``text`` one character at a time, then decode the
            # result one byte at a time.  The decoder is created only after
            # encoding has finished.  When ``final`` is true, both coders
            # get an extra empty call with the final flag set.
            encoded = b"".join(encoder.encode(char) for char in text)
            if final:
                encoded += encoder.encode("", True)
            decoder = make_decoder()
            decoded = "".join(decoder.decode(bytes([byte]))
                              for byte in encoded)
            if final:
                decoded += decoder.decode(b"", True)
            return decoded

        for encoding in all_unicode_encodings:
            # The registered name must match the list entry, modulo "_"/"-".
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            raw, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             mismatch(consumed, len(text)))
            chars, consumed = codecs.getdecoder(encoding)(raw)
            self.assertEqual(chars, text, mismatch(chars, text))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                pipe = Queue(b"")
                writer = codecs.getwriter(encoding)(pipe)
                pieces = []
                for char in text:
                    writer.write(char)
                    piece = pipe.read()
                    self.assertTrue(type(piece) is bytes, type(piece))
                    pieces.append(piece)
                streamed = b"".join(pieces)

                pipe = Queue(b"")
                reader = codecs.getreader(encoding)(pipe)
                decoded = ""
                for byte in streamed:
                    pipe.write(bytes([byte]))
                    decoded += reader.read()
                self.assertEqual(decoded, text, mismatch(decoded, text))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    decoded = incremental_roundtrip(
                        encoder,
                        lambda: codecs.getincrementaldecoder(encoding)(),
                        True)
                    self.assertEqual(decoded, text, mismatch(decoded, text))

                    # check C API
                    decoded = incremental_roundtrip(
                        cencoder,
                        lambda: _testcapi.codec_incrementaldecoder(encoding),
                        True)
                    self.assertEqual(decoded, text, mismatch(decoded, text))

                    # check iterencode()/iterdecode()
                    chunks = codecs.iterencode(text, encoding)
                    result = "".join(codecs.iterdecode(chunks, encoding))
                    self.assertEqual(result, text, mismatch(result, text))

                    # check iterencode()/iterdecode() with empty string
                    chunks = codecs.iterencode("", encoding)
                    result = "".join(codecs.iterdecode(chunks, encoding))
                    self.assertEqual(result, "")

                # NOTE(review): the source this was taken from had lost its
                # indentation; this check is assumed to sit inside the
                # broken_incremental_coders guard -- confirm.
                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        decoded = incremental_roundtrip(
                            encoder,
                            lambda: codecs.getincrementaldecoder(encoding)("ignore"),
                            False)
                        self.assertEqual(decoded, text,
                                         mismatch(decoded, text))

                        decoded = incremental_roundtrip(
                            cencoder,
                            lambda: _testcapi.codec_incrementaldecoder(encoding, "ignore"),
                            False)
                        self.assertEqual(decoded, text,
                                         mismatch(decoded, text))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            make_reader = codecs.getreader(encoding)
            reader = make_reader(io.BytesIO(text.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument and, except for idna and
        # punycode, an int argument.
        for encoding in all_unicode_encodings:
            decode = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decode()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decode(42)

    def test_bad_encode_args(self):
        # Encoders must reject a missing argument.
        for encoding in all_unicode_encodings:
            encode = codecs.getencoder(encoding)
            with self.assertRaises(TypeError):
                encode()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text, text.encode(encoding))
            self.check_state_handling_encode(encoding, text, text.encode(encoding))
1577
class CharmapTest(unittest.TestCase):
    """Check codecs.charmap_decode() with str, int->str and int->int maps."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        self.assertEqual(decode(data, "strict", "abc"), ("abc", 3))

        # The map is too short for byte 2.
        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", "ab")

        # Byte 2 is either beyond the end of the map or mapped to U+FFFE.
        lenient_cases = (
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, table, expected in lenient_cases:
            self.assertEqual(decode(data, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"

        strict_cases = (
            ({0: 'a', 1: 'b', 2: 'c'}, "abc"),
            ({0: 'Aa', 1: 'Bb', 2: 'Cc'}, "AaBbCc"),
            ({0: '\U0010FFFF', 1: 'b', 2: 'c'}, "\U0010FFFFbc"),
            ({0: 'a', 1: 'b', 2: ''}, "ab"),
        )
        for table, expected in strict_cases:
            self.assertEqual(decode(data, "strict", table), (expected, 3))

        # Byte 2 has no entry in the map.
        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", {0: 'a', 1: 'b'})

        # Byte 2 is either missing from the map or mapped to None.
        lenient_cases = (
            ("replace", {0: 'a', 1: 'b'}, "ab\ufffd"),
            ("replace", {0: 'a', 1: 'b', 2: None}, "ab\ufffd"),
            ("ignore", {0: 'a', 1: 'b'}, "ab"),
            ("ignore", {0: 'a', 1: 'b', 2: None}, "ab"),
        )
        for errors, table, expected in lenient_cases:
            self.assertEqual(decode(data, errors, table), (expected, 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        data = b"\x00\x01\x02"
        a, b, c = (ord(char) for char in 'abc')

        self.assertEqual(decode(data, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(data, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        # 0x110000 is rejected as a mapping target.
        with self.assertRaises(TypeError):
            decode(data, "strict", {0: 0x110000, 1: b, 2: c})

        # Byte 2 has no entry in the map.
        with self.assertRaises(UnicodeDecodeError):
            decode(data, "strict", {0: a, 1: b})

        for errors, expected in (("replace", "ab\ufffd"), ("ignore", "ab")):
            self.assertEqual(decode(data, errors, {0: a, 1: b}),
                             (expected, 3))
1714
1715
class WithStmtTest(unittest.TestCase):
    """Check that codecs stream wrappers can be used in a with statement."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            content = srw.read()
            self.assertEqual(content, "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001728
class TypesTest(unittest.TestCase):
    """Check which argument types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs_decode is only checked when this build of codecs exposes it.
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            decoders.append(mbcs_decode)
        for decode in decoders:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        escape_decoders = (
            codecs.unicode_escape_decode,
            codecs.raw_unicode_escape_decode,
        )
        for decode in escape_decoders:
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001758
class SurrogateEscapeTest(unittest.TestCase):
    """Check the "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, encoding, raw, text):
        # Decoding ``raw`` must give ``text`` and encoding ``text`` must give
        # ``raw`` back, both with the "surrogateescape" handler.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1791
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001792
class BomTest(unittest.TestCase):
    """Check what codecs.open() files read back after writes and seeks."""

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)

        def reopen():
            # Open the scratch file in 'w+' mode with the encoding of the
            # current loop iteration.
            return codecs.open(support.TESTFN, 'w+', encoding=encoding)

        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen() as f:
                f.write(data)
                f.write(data)
                # Read the whole file twice from the start.
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen() as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen() as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen() as f:
                f.write(data)
                position = f.tell()
                f.seek(position)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen() as f:
                writer = f.writer
                writer.write(data)
                position = writer.tell()
                writer.seek(position)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001848
Victor Stinner3fed0872010-05-22 02:16:27 +00001849
# Bytes-to-bytes codecs iterated over by the TransformCodecTest methods.
bytes_transform_encodings = "base64_codec uu_codec quopri_codec hex_codec".split()

# The compression codecs are only listed when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1868
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the codecs in bytes_transform_encodings."""

    @staticmethod
    def _reader_over_encoded_byte(encoding):
        # Stream reader positioned at the start of b"\x80" encoded with
        # ``encoding``.
        encoded = codecs.encode(b"\x80", encoding)
        return codecs.getreader(encoding)(io.BytesIO(encoded))

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(payload)
            self.assertEqual(consumed, len(payload))
            decode = codecs.getdecoder(encoding)
            decoded, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            reader = self._reader_over_encoded_byte(encoding)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        for encoding in bytes_transform_encodings:
            # These two codecs are left out of the readline() check.
            if encoding in ('uu_codec', 'zlib_codec'):
                continue
            reader = self._reader_over_encoded_byte(encoding)
            self.assertEqual(reader.readline(), b"\x80")
1896
1897
def test_main():
    """Run the test case classes listed below through support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        EscapeDecodeTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()