blob: 8328a2209bf73b4430f63b68d82dfe3f3371f768 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only when running on Windows with a major OS version of 6 or higher;
# always False on every other platform (sys.getwindowsversion() only exists
# on Windows, hence the platform guard).
if sys.platform == 'win32':
    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
else:
    VISTA_OR_LATER = False
15
# ctypes is optional: when it cannot be imported, ``ctypes`` is bound to None
# and SIZEOF_WCHAR_T is set to the sentinel -1; otherwise SIZEOF_WCHAR_T is
# the size in bytes of the C ``wchar_t`` type as reported by ctypes.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type (bytes or str)
        # that write() appends to and read() returns.
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* drains the whole queue; otherwise at most *size*
        items are returned and the remainder stays queued.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty (keeps the type)
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
43
class MixInCheckStateHandling:
    """Mix-in with helpers that exercise getstate()/setstate() of the
    incremental codecs.

    The helpers call ``assertIsInstance``, ``assertFalse`` and
    ``assertEqual`` on ``self``, so the class they are mixed into must
    provide them (e.g. a ``unittest.TestCase``).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that a decoder's state can be moved to a fresh decoder.

        For every split point of the encoded input *s*, the first part is
        decoded, the state is extracted and installed into a new decoder,
        and the second part is decoded there; the concatenation must equal
        the expected text *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that an encoder's state can be moved to a fresh encoder.

        For every split point of the text *u*, the first part is encoded,
        the state is transplanted into a new encoder which encodes the rest;
        the concatenation must equal the expected bytes *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
76
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader tests shared by the per-encoding test classes.

    Every test reads ``self.encoding``; this class does not define it, so
    it has to be supplied by the concrete subclass.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded *input* one byte at a time and compare the
        accumulated decoded text after each byte with the matching entry
        of *partialresults*.
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read line by line until exhaustion; lines are joined with "|"
            # so that the split points are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line endings are listed explicitly: every one of them is
        # whitespace, so "\n \r\n \r \u2028".split() yields an empty list
        # and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "\u3042" (not "\3042", which is an octal escape followed by
            # "2"); +200 so that the first line is not empty.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so must the expectation.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # NOTE(review): the literals below are reproduced exactly as they
        # appear in the scraped source; runs of spaces inside them may have
        # been collapsed by the scrape -- compare with the upstream file.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Iterating the reader must give back the lines unchanged.
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written through the
        # writer becomes available to the reader incrementally.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest):
    """Tests for the BOM-prefixed "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # encode some stream
        s = io.BytesIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
387
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once the
        # last of its four bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
431
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One entry per encoded byte: a character only shows up once the
        # last of its four bytes has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
475
476
class UTF16Test(ReadTest):
    """Tests for the BOM-prefixed "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in little- and big-endian order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # encode some stream
        s = io.BytesIO()
        f = codecs.getwriter(self.encoding)(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = codecs.getreader(self.encoding)(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' (still no explicit binary flag) instead of 'U': the 'U'
        # mode was deprecated and later removed from open(), which
        # codecs.open() forwards the mode to.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000556
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One entry per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
599
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One entry per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # (malformed input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
642
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte of the input.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions; the other
        # error handlers each produce their own substitute.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or malformed surrogate sequences must still be rejected.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000701
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest):
    """Tests for the cp65001 codec; skipped unless running on win32."""

    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes); an expected
        # value of None means the call must raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Each case is (raw bytes, error handler, expected text); an expected
        # value of None means the call must raise UnicodeDecodeError.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Strict mode must refuse a lone surrogate in both directions.
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("cp65001")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("cp65001")
        # Expected output of encoding "[\uDC80]" under each error handler.
        expected_by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in expected_by_handler:
            self.assertEqual("[\uDC80]".encode("cp65001", handler), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # (text, bytes) pairs that must convert both ways with
        # "surrogatepass".
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in pairs:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
800
801
802
class UTF7Test(ReadTest):
    """Runs the inherited ReadTest checks with the utf-7 codec."""

    encoding = "utf-7"

    def test_partial(self):
        # Expected outputs handed to check_partial for the input "a+-b".
        text = "a+-b"
        expected_outputs = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected_outputs)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000817
class UTF16ExTest(unittest.TestCase):
    """Error and argument checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte decoded in strict mode is expected to be rejected.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument must raise TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
825
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is returned as bytes plus its length.
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
841
class UTF8SigTest(ReadTest):
    """Inherited ReadTest checks plus BOM handling tests for utf-8-sig."""

    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        expected_outputs = [
            "",
            "",
            "",  # First BOM has been read and skipped
            "",
            "",
            "\ufeff",  # Second BOM has been read and emitted
            "\ufeff\x00",  # "\x00" read and emitted
            "\ufeff\x00",  # First byte of encoded "\xff" read
            "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
            "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected_outputs)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # Encoding then incrementally decoding must give back the text.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = "spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream_decodes(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig stream reader once per size
        # hint (None means an unsized read()) and check that the collected
        # output equals unistring each time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM.
        self._check_stream_decodes(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Same payload without a leading BOM.
        self._check_stream_decodes(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ")
925
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # An empty input gives empty bytes and a consumed length of zero.
        result = codecs.escape_decode("")
        self.assertEqual(result, (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000929
class RecodingTest(unittest.TestCase):
    """Regression test that writes through and closes an EncodedFile."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000938
# From RFC 3492
# Each entry is a pair (unicode string, punycode-encoded bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1042
# Import-time sanity check: print any entry of punycode_testcases that is
# not a two-element sequence (e.g. a pair broken by a missing comma).
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001046
class PunycodeTest(unittest.TestCase):
    """Runs every punycode_testcases pair through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = str(uni.encode("punycode"), "ascii")
            wanted = str(puny, "ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode a second time from a copy that went through an
            # ASCII decode/encode round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001065
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated unicode_internal codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        # Byte strings are written big-endian and reversed when needed.
        decodable = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        undecodable = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in decodable:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in undecodable:
            if little_endian:
                internal = bytes(reversed(internal))
            with support.check_warnings(deprecation):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            replaced = invalid.decode("unicode_internal", "replace")
            self.assertEqual(replaced, '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised error must describe the codec, the input and the
        # offending byte range.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        try:
            with support.check_warnings(deprecation):
                data.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(data, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register ignore_errors under a custom name and decode "ab" with
        # four extra bytes spliced into the middle of its encoded form.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        with support.check_warnings(deprecation):
            ab = "ab".encode("unicode_internal").decode()
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        deprecation = ('unicode_internal codec has been '
                       'deprecated', DeprecationWarning)
        with support.check_warnings(deprecation):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001141
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is a pair (input, expected output), both given as UTF-8 bytes.
# An expected output of None means nameprep must reject the input, and
# (None, None) is a placeholder for a skipped vector; the placeholders keep
# list positions in step with the "3.N" numbering used in failure messages.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1294
1295
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep over every nameprep_tests vector."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for number, (raw_input, raw_expected) in enumerate(nameprep_tests, 1):
            if raw_input is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = raw_input.decode("utf-8", "surrogatepass")
            if raw_expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = raw_expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as exc:
                # Re-raise with the vector number from the test suite.
                raise support.TestFailed(
                    "Test 3.%d: %s" % (number, str(exc)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001314
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        # bytes -> str through the str() constructor, with and without a
        # trailing dot and with a punycode ("xn--") label.
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        # str -> bytes through str.encode(), mirroring test_builtin_decode.
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Reading past the end of an exhausted stream must return "".
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time through iterdecode().
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
            "pyth\xf6n.org."
        )
        # Punycode label without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org"), "idna")),
            "pyth\xf6n.org"
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is held back until its terminating dot (or final=True)
        # has been seen.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() over a str feeds the encoder one character at a time.
        self.assertEqual(
            b"".join(codecs.iterencode("python.org", "idna")),
            b"python.org"
        )
        self.assertEqual(
            b"".join(codecs.iterencode("python.org.", "idna")),
            b"python.org."
        )
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
            b"xn--pythn-mua.org."
        )
        # Non-ASCII label without a trailing dot: the last label is only
        # emitted by the final flush.
        self.assertEqual(
            b"".join(codecs.iterencode("pyth\xf6n.org", "idna")),
            b"xn--pythn-mua.org"
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001390
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and lookup behaviour of the codecs module functions."""

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so the previous LC_CTYPE is
        # put back even when the test is skipped or fails.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1445
class StreamReaderTest(unittest.TestCase):
    """Check readlines() on a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two three-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001455
class EncodedFileTest(unittest.TestCase):
    """Check that codecs.EncodedFile transcodes on both read and write."""

    def test_basic(self):
        # Reading: the file holds UTF-8, the caller sees UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the file receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001467
# Codec names exercised by the generic round-trip tests below.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only added when this build exposes codecs.mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings skipped by the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001585
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic round-trip checks run over every name in all_unicode_encodings."""

    def test_basics(self):
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the list entry, modulo
            # "_" vs "-" (with special cases for "*_codec" and "latin_1").
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    # The writer must hand exactly bytes to the stream.
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # The int-argument check is not applied to idna and punycode.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1723
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with string, int->str and int->int mappings."""

    def test_decode_with_string_map(self):
        # The mapping is a str indexed by byte value.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 0x02 has no entry in a two-character map.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # U+FFFE in the map is treated as "undefined".
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # The mapping is a dict from byte value to replacement string.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # Replacement strings may be longer than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or empty.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        # An explicit None entry behaves like a missing key.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        # The mapping is a dict from byte value to code point.
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is rejected with TypeError.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )
1871
1872
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers must be usable as context managers."""

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001885
class TypesTest(unittest.TestCase):
    """Which argument types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is beyond the last valid code point (U+10FFFF).
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
1921
class SurrogateEscapeTest(unittest.TestCase):
    """Round-trip undecodable bytes through the "surrogateescape" handler."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1954
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001955
class BomTest(unittest.TestCase):
    def test_seek0(self):
        # Checks where the BOM ends up when writes are mixed with seek()
        # calls, for each UTF-16 / UTF-32 codec name listed below.
        text = "1234567890"
        doubled = text * 2
        path = support.TESTFN
        self.addCleanup(support.unlink, path)

        def reopen(codec):
            # Every scenario works on a fresh 'w+' handle for the same file.
            return codecs.open(path, 'w+', encoding=codec)

        for codec in ("utf-16", "utf-16-le", "utf-16-be",
                      "utf-32", "utf-32-le", "utf-32-be"):
            # Check if the BOM is written only once
            with reopen(codec) as stream:
                stream.write(text)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(codec) as stream:
                stream.write(text[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(codec) as stream:
                writer = stream.writer
                writer.write(text[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), text)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(codec) as stream:
                stream.write(text)
                stream.seek(stream.tell())
                stream.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(codec) as stream:
                writer = stream.writer
                writer.write(text)
                writer.seek(writer.tell())
                writer.write(text)
                stream.seek(0)
                self.assertEqual(stream.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00002011
Victor Stinner3fed0872010-05-22 02:16:27 +00002012
# Names of the bytes-to-bytes codecs exercised by TransformCodecTest.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# The compression codecs are only added when their module can be imported.
try:
    import zlib
    bytes_transform_encodings.append("zlib_codec")
except ImportError:
    pass

try:
    import bz2
    bytes_transform_encodings.append("bz2_codec")
except ImportError:
    pass
2031
class TransformCodecTest(unittest.TestCase):

    def test_basics(self):
        # Round-trip every byte value through each transform codec and
        # check the reported "consumed" lengths on both legs.
        payload = bytes(range(256))
        for codec_name in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(codec_name)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(codec_name)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        # A stream reader over the encoded form must hand back the
        # original byte from read().
        original = b"\x80"
        for codec_name in bytes_transform_encodings:
            encoded = codecs.encode(original, codec_name)
            reader = codecs.getreader(codec_name)(io.BytesIO(encoded))
            self.assertEqual(reader.read(), original)

    def test_readline(self):
        # Same as test_read but through readline(); two codecs are
        # excluded from this check.
        original = b"\x80"
        excluded = ['uu_codec', 'zlib_codec']
        for codec_name in bytes_transform_encodings:
            if codec_name not in excluded:
                encoded = codecs.encode(original, codec_name)
                reader = codecs.getreader(codec_name)(io.BytesIO(encoded))
                self.assertEqual(reader.readline(), original)
2059
2060
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # Code page -1 must be rejected with ValueError and code page 123
        # with WindowsError, for encoding as well as decoding.
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        # Each entry of *tests* is (raw bytes, error handler, expected text);
        # an expected value of None means decoding must raise
        # UnicodeDecodeError.
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                outcome = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with errors=%r: %s'
                          % (raw, cp, errors, err))
            text = outcome[0]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # The consumed count must satisfy 0 <= consumed <= len(raw).
            consumed = outcome[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        # Each entry of *tests* is (text, error handler, expected bytes);
        # an expected value of None means encoding must raise
        # UnicodeEncodeError.
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                outcome = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with errors=%r: %s'
                          % (text, cp, errors, err))
            data = outcome[0]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(outcome[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_decode)

        utf8_decode = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode)

        # The encoding half is only exercised when VISTA_OR_LATER is true.
        if not VISTA_OR_LATER:
            return
        utf8_encode = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode)

    def test_incremental(self):
        # Decode with final=False and compare the whole
        # (text, consumed) result for each input.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2208
2209
def test_main():
    # Hand every test case class below to support.run_unittest() in a
    # single call, in the order listed.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        EscapeDecodeTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()