blob: 93660f7a19811f8c47ca3130e9fabd201ec407cc [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only when running on Windows with a major version of 6 or higher
# (Vista and later).  sys.getwindowsversion() exists only on Windows, so the
# platform test has to short-circuit before it is called.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional.  When it cannot be imported, ``ctypes`` is bound to
# None and SIZEOF_WCHAR_T to the sentinel -1; otherwise SIZEOF_WCHAR_T is
# the size in bytes of the platform's C wchar_t.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The buffer passed to the constructor fixes the data type (``b""`` for a
    byte queue, ``""`` for a text queue); everything written afterwards is
    appended to it with ``+=``.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return up to *size* items from the front of the queue.

        A negative *size* (the default) drains the whole queue.  Reading
        from an empty queue returns an empty value of the buffer's type.
        """
        if size < 0:
            size = len(self._buffer)
        data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
43
class MixInCheckStateHandling:
    """Mixin for TestCase classes: getstate()/setstate() round-trip checks
    for incremental decoders and encoders.

    It relies on the assert* methods of the unittest.TestCase it is mixed
    into.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Check that decoding *s* can be interrupted at any byte offset.

        For every split point the first part is decoded, the decoder state
        is moved into a brand new decoder, and the remainder is decoded
        there; the concatenated output must equal *u*.
        """
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Check that encoding *u* can be interrupted at any character offset.

        For every split point the first part is encoded, the encoder state
        is moved into a brand new encoder, and the remainder is encoded
        there; the concatenated output must equal *s*.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
76
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader and incremental-decoder tests shared by the codec test classes.

    Every test uses ``self.encoding``, which this class does not define;
    subclasses supply it as a class attribute.
    """

    def check_partial(self, input, partialresults):
        """Decode the encoded form of *input* one byte at a time.

        After each byte the text decoded so far must equal the matching
        entry of *partialresults*.  The check is run with a StreamReader,
        with a fresh incremental decoder, with that decoder again after
        reset(), and finally *input* is round-tripped via iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        # StreamReader.readline() must split on every kind of line end,
        # with and without keepends, with and without a size hint.
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: all of them count as
        # whitespace, so building the list with str.split() yields an
        # empty list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty as well.
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expectation must too.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # every "aaa" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must hand back exactly the lines
        # that were written, "\r\n" line ends included.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is still being written to: a
        # trailing "\r" is returned as a complete line, and a "\n" that
        # arrives afterwards shows up as a line of its own.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n"-terminated lines must come back unchanged and in
        # order, followed by "" at end of input.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest):
    """Tests for the BOM-prefixed "utf-32" codec."""

    encoding = "utf-32"

    # "spamspam" behind a single BOM, in little- and big-endian byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with something other than a BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped by the error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
383
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec (little-endian, no BOM)."""

    encoding = "utf-32-le"

    def test_partial(self):
        # Four bytes per character: a character only appears once its
        # last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
423
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec (big-endian, no BOM)."""

    encoding = "utf-32-be"

    def test_partial(self):
        # Four bytes per character: a character only appears once its
        # last byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
463
464
class UTF16Test(ReadTest):
    """Tests for the BOM-prefixed "utf-16" codec."""

    encoding = "utf-16"

    # "spamspam" behind a single BOM, in little- and big-endian byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn shows the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that starts with something other than a BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit is replaced or dropped by the error handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the universal-newlines flag 'U':
        # 'U' is deprecated and rejected by newer versions of open(), and
        # the point of the test is precisely that no newline translation
        # takes place.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000540
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec (little-endian, no BOM)."""

    encoding = "utf-16-le"

    def test_partial(self):
        # Two bytes per character: a character only appears once its
        # second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, result of decoding it with
        # errors='replace'); the same input must raise in strict mode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
579
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec (big-endian, no BOM)."""

    encoding = "utf-16-be"

    def test_partial(self):
        # Two bytes per character: a character only appears once its
        # second byte has been fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, result of decoding it with
        # errors='replace'); the same input must raise in strict mode.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
618
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""

    encoding = "utf-8"

    def test_partial(self):
        # The input mixes 1-, 2- and 3-byte sequences; a character only
        # appears once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions;
        # on encoding, each error handler produces its own substitute.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets lone surrogates through in both directions,
        # but a truncated sequence is still an error.
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000671
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest):
    """Tests for the "cp65001" codec; skipped everywhere except Windows.

    Expectations for lone surrogates differ depending on VISTA_OR_LATER.
    """

    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means encoding must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                    '%a.encode("cp65001", %r)=%a != %a'
                    % (text, errors, encoded, expected))
            else:
                self.assertRaises(UnicodeEncodeError,
                    text.encode, "cp65001", errors)

    def test_decode(self):
        # Each entry is (bytes, error handler, expected text); an expected
        # value of None means decoding must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                    '%a.decode("cp65001", %r)=%a != %a'
                    % (raw, errors, decoded, expected))
            else:
                self.assertRaises(UnicodeDecodeError,
                    raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions;
        # on encoding, each error handler produces its own substitute.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" lets lone surrogates through in both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
770
771
772
class UTF7Test(ReadTest):
    """Runs the ReadTest checks with the utf-7 codec."""
    encoding = "utf-7"

    def test_partial(self):
        # The list holds the expected decoded text after each step of
        # check_partial (inherited; presumably one step per encoded byte,
        # verify against ReadTest).
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000787
class UTF16ExTest(unittest.TestCase):
    """Error-path tests for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single byte cannot form a UTF-16 code unit; with the last
        # argument (final) set to True the strict decoder has to raise.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode,
                          b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the required data argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
795
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        # An array of signed bytes is accepted; the result is the raw
        # bytes together with the number of bytes consumed.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument, then an argument of an unsupported type.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
811
class UTF8SigTest(ReadTest):
    """Tests for the utf-8-sig codec."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected result per step of check_partial (inherited from
        # ReadTest); the inline comments describe what has been consumed
        # at that step.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "",  # First BOM has been read and skipped
                "",
                "",
                "\ufeff",  # Second BOM has been read and emitted
                "\ufeff\x00",  # "\x00" read and emitted
                "\ufeff\x00",  # First byte of encoded "\xff" read
                "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
                "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding with utf-8-sig and decoding incrementally has to give
        # back the original text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig StreamReader.

        The stream is read to exhaustion once without a size argument and
        once for each of several size arguments; the concatenated chunks
        have to equal *unistring* every time.
        """
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a UTF-8 BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # The same text without a leading BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
891
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected value has to be
        # a bytes object as well; comparing against "" can never succeed
        # on Python 3.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000895
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000896class RecodingTest(unittest.TestCase):
897 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000898 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200899 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000900 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000901 f2.close()
902 # Python used to crash on this at exit because of a refcount
903 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000904
# From RFC 3492
# Each entry is a pair (unicode text, punycode-encoded bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a pair (for
# example because of a missing comma between adjacent literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001012
class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases vectors through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(
                str(uni.encode("punycode"), "ascii").lower(),
                str(puny, "ascii").lower()
            )

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode again after a round trip through str, which also
            # fails if the vector contains non-ASCII bytes.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001031
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated unicode_internal codec."""

    # Filter handed to support.check_warnings() wherever a test expects
    # the codec's deprecation warning.
    _DEPRECATED = ('unicode_internal codec has been deprecated',
                   DeprecationWarning)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The samples are written big-endian and byte-reversed on
        # little-endian platforms.
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(self._DEPRECATED):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """Check the attributes of the raised UnicodeDecodeError."""
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises is the outer manager so that, as before, the
        # exception travels through check_warnings() before being caught.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(self._DEPRECATED):
                data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """Decode with a custom error handler that ignores bad input."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            # Four 0x22 bytes are inserted between the two encoded
            # characters; the handler is expected to drop them.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001107
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected), both UTF-8 encoded.  An expected value
# of None marks input that nameprep has to reject; (None, None) marks a
# skipped vector and keeps the numbering aligned with the draft.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1260
1261
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
                continue
            prepped = str(prepped, "utf-8", "surrogatepass")
            # Label every problem with the number of the vector so that
            # it can be found in the table.
            label = "Test 3.%d" % (pos+1)
            try:
                result = nameprep(orig)
            except Exception as e:
                self.fail("%s: %s" % (label, str(e)))
            self.assertEqual(result, prepped, label)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001281class IDNACodecTest(unittest.TestCase):
1282 def test_builtin_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001283 self.assertEqual(str(b"python.org", "idna"), "python.org")
1284 self.assertEqual(str(b"python.org.", "idna"), "python.org.")
1285 self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
1286 self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001287
1288 def test_builtin_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001289 self.assertEqual("python.org".encode("idna"), b"python.org")
1290 self.assertEqual("python.org.".encode("idna"), b"python.org.")
1291 self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
1292 self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001293
Martin v. Löwis8b595142005-08-25 11:03:38 +00001294 def test_stream(self):
Victor Stinner05010702011-05-27 16:50:40 +02001295 r = codecs.getreader("idna")(io.BytesIO(b"abc"))
Martin v. Löwis8b595142005-08-25 11:03:38 +00001296 r.read(3)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001297 self.assertEqual(r.read(), "")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001298
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001299 def test_incremental_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001300 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001301 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001302 "python.org"
1303 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001304 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001305 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001306 "python.org."
1307 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001308 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001309 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001310 "pyth\xf6n.org."
1311 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001312 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001313 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001314 "pyth\xf6n.org."
1315 )
1316
1317 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001318 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1319 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1320 self.assertEqual(decoder.decode(b"rg"), "")
1321 self.assertEqual(decoder.decode(b"", True), "org")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001322
1323 decoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001324 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1325 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1326 self.assertEqual(decoder.decode(b"rg."), "org.")
1327 self.assertEqual(decoder.decode(b"", True), "")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001328
1329 def test_incremental_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001330 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001331 b"".join(codecs.iterencode("python.org", "idna")),
1332 b"python.org"
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001333 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001334 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001335 b"".join(codecs.iterencode("python.org.", "idna")),
1336 b"python.org."
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001337 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001338 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001339 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1340 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001341 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001342 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001343 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1344 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001345 )
1346
1347 encoder = codecs.getincrementalencoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001348 self.assertEqual(encoder.encode("\xe4x"), b"")
1349 self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
1350 self.assertEqual(encoder.encode("", True), b"org")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001351
1352 encoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001353 self.assertEqual(encoder.encode("\xe4x"), b"")
1354 self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
1355 self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001356
class CodecsModuleTest(unittest.TestCase):
    # Argument checking and basic behaviour of the module-level functions
    # of the codecs module.

    def test_decode(self):
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding given: the default encoding is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding given: the default encoding is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        # A search function must be callable.
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE even if the test fails or is skipped.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1411
class StreamReaderTest(unittest.TestCase):
    # readlines() on a UTF-8 StreamReader over multi-byte input.

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two three-byte UTF-8 characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001421
class EncodedFileTest(unittest.TestCase):
    # codecs.EncodedFile() must transcode between the data encoding seen by
    # the caller and the file encoding of the wrapped stream.

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller gets UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller writes UTF-8, the underlying file gets Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001433
# Names of the text encodings exercised by BasicUnicodeTest.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
1538
# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal",
]
# Encodings excluded from the incremental encoder/decoder checks: everything
# that is broken with streams, plus "idna".
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001551
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    # Round-trip checks run against every encoding listed in
    # all_unicode_encodings.

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the name it was looked up
            # under (modulo "_" vs "-").
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    # The writer must hand exactly bytes to the stream.
                    self.assertIs(type(chunk), bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempt from the non-bytes argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1689
class CharmapTest(unittest.TestCase):
    # codecs.charmap_decode() with the three supported kinds of mapping:
    # a string, a dict of int -> str and a dict of int -> int.

    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 is beyond the end of the mapping string.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A "\ufffe" entry in the map is treated like a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # One byte may map to more than one character ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to no character at all.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        # A None entry is treated like a missing entry.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is rejected.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )
1837
1838
class WithStmtTest(unittest.TestCase):
    # The codecs stream wrappers must be usable as context managers.

    def test_encodedfile(self):
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001851
class TypesTest(unittest.TestCase):
    # Which argument types the low-level decoder functions accept.

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # \U00110000 is above the last valid code point: it is an error in
        # strict mode and decodes to U+FFFD with "replace".
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
1887
class SurrogateEscapeTest(unittest.TestCase):
    # The "surrogateescape" error handler maps each undecodable byte 0xNN to
    # the lone surrogate U+DCNN on decoding and back again on encoding.

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1920
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001921
class BomTest(unittest.TestCase):
    # Byte order mark handling of files opened with codecs.open().

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Remove the scratch file once the test is done, pass or fail.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001977
Victor Stinner3fed0872010-05-22 02:16:27 +00001978
# Names of the bytes-to-bytes codecs exercised by TransformCodecTest.
# These four are always listed.
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# "zlib_codec" is listed only when the zlib module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

# "bz2_codec" is listed only when the bz2 module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1997
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for every codec in bytes_transform_encodings.

    Each assertion passes the codec name as its failure message so that a
    failing run identifies which encoding of the loop broke.
    """

    def test_basics(self):
        # Encode all 256 byte values with the stateless encoder, decode the
        # result again and verify both the data and the reported sizes.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            (o, size) = codecs.getencoder(encoding)(binput)
            self.assertEqual(size, len(binput), msg=encoding)
            (i, size) = codecs.getdecoder(encoding)(o)
            self.assertEqual(size, len(o), msg=encoding)
            self.assertEqual(i, binput, msg=encoding)

    def test_read(self):
        # A stream reader wrapped around the encoded form of b"\x80" must
        # give the original byte back from read().
        for encoding in bytes_transform_encodings:
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.read()
            self.assertEqual(sout, b"\x80", msg=encoding)

    def test_readline(self):
        # Same as test_read, but through readline().
        for encoding in bytes_transform_encodings:
            # NOTE(review): these two codecs are deliberately left out of
            # the readline() check; the reason is not visible here --
            # presumably their encoded form cannot be decoded line by line.
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            sin = codecs.encode(b"\x80", encoding)
            reader = codecs.getreader(encoding)(io.BytesIO(sin))
            sout = reader.readline()
            self.assertEqual(sout, b"\x80", msg=encoding)
2025
2026
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode().

    The whole class is skipped unless sys.platform is 'win32'.
    """

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page number is rejected with ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is expected to raise WindowsError.
        with self.assertRaises(WindowsError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The text of the raised error must mention the code page name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run a table of decoding cases against code page *cp*.

        *tests* is an iterable of (raw, errors, expected) triples.  When
        *expected* is None the call must raise UnicodeDecodeError;
        otherwise the decoded text must equal *expected* and the reported
        length must lie between 0 and len(raw).
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue

            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            mismatch = ('%a.decode("cp%s", %r)=%a != %a'
                        % (raw, cp, errors, result[0], expected))
            self.assertEqual(result[0], expected, mismatch)
            # assert 0 <= result[1] <= len(raw)
            self.assertGreaterEqual(result[1], 0)
            self.assertLessEqual(result[1], len(raw))

    def check_encode(self, cp, tests):
        """Run a table of encoding cases against code page *cp*.

        *tests* is an iterable of (text, errors, expected) triples.  When
        *expected* is None the call must raise UnicodeEncodeError;
        otherwise the encoded bytes must equal *expected* and the reported
        length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue

            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            mismatch = ('%a.encode("cp%s", %r)=%a != %a'
                        % (text, cp, errors, result[0], expected))
            self.assertEqual(result[0], expected, mismatch)
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_encode(932, encode_cases)
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_encode(1252, encode_cases)
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_encode(cp, encode_cases)
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_decode_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(932, cp932_decode_cases)
        self.check_decode(self.CP_UTF8, utf8_decode_cases)
        # The UTF-8 encoding cases are only run when VISTA_OR_LATER is true.
        if VISTA_OR_LATER:
            utf8_encode_cases = (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            )
            self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Each call passes False as the fourth argument (presumably the
        # "final" flag -- TODO confirm).  Per the expected results, a
        # lone leading byte at the end of the input is left unconsumed.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2174
2175
def test_main():
    """Run the listed test case classes through support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    # The classes are handed over in the order listed above.
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002207
2208
# Run the suite when this file is executed as a script.
if __name__ == '__main__':
    test_main()