blob: f2a1ae3f790f2ef54bd7902fb5f87a7fdfeb5bfb [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only when running on Windows with a major version of 6 or higher;
# always False on every other platform.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional: when it cannot be imported the name is bound to None
# and the wchar_t size is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue:
    """
    FIFO buffer: data is appended with write() and consumed with read()
    """
    def __init__(self, buffer):
        # The initial buffer also fixes the sequence type used afterwards.
        self._buffer = buffer

    def write(self, chars):
        # Append to the end of the buffered data.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return the first `size` items; a negative size drains
        # the whole buffer.
        if size < 0:
            size = len(self._buffer)
        head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
43
class MixInCheckStateHandling:
    # Helpers checking getstate()/setstate() of incremental codecs.  They use
    # the assert* methods of the test case class this mixin is combined with.

    def check_state_handling_decode(self, encoding, u, s):
        # Split s at every possible position, decode the head with one
        # decoder, move its state into a brand new decoder and decode the
        # tail with that one: both outputs together must equal u.
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            first = make_decoder()
            head = first.decode(s[:split])
            state = first.getstate()
            self.assertIsInstance(state[1], int)
            # Verify the condition documented for
            # IncrementalDecoder.getstate()
            if not state[1]:
                # put the decoder back into its default state with an
                # empty buffer
                first.setstate((state[0][:0], 0))
                # Re-feeding the buffered input must not yield any output
                self.assertFalse(first.decode(state[0]))
                # and must bring the decoder back to the very same state
                self.assertEqual(state, first.getstate())
            # Hand the extracted state over to a fresh decoder
            second = make_decoder()
            second.setstate(state)
            tail = second.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: encode the head of u, transfer the state to
        # a new encoder, encode the tail and compare the joined output to s.
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            first = make_encoder()
            head = first.encode(u[:split])
            state = first.getstate()
            second = make_encoder()
            second.setstate(state)
            tail = second.encode(u[split:], True)
            self.assertEqual(s, head + tail)
76
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Reader tests shared by the codec test classes.  Every method reads
    # self.encoding, which is not defined here: subclasses provide it.

    def check_partial(self, input, partialresults):
        # Feed the encoded form of input to several decoding APIs one byte
        # at a time.  After the n-th byte, everything decoded so far must
        # equal the n-th entry of partialresults.
        encoded = input.encode(self.encoding)

        # First through a StreamReader that reads from a queue.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        def feed_bytewise(decoder):
            # Same check with an incremental decoder.
            result = ""
            for (c, partialresult) in zip(encoded, partialresults):
                result += decoder.decode(bytes([c]))
                self.assertEqual(result, partialresult)
            # check that there's nothing left in the buffers
            self.assertEqual(decoder.decode(b"", True), "")
            self.assertEqual(decoder.buffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        feed_bytewise(d)

        # Check whether the reset method works properly
        d.reset()
        feed_bytewise(d)

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read input line by line and join the lines with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: all of them are
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and loops over it silently do nothing.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 so that the first line is long as well
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines it read with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over the reader must give back exactly these lines.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one queue, so the reader only sees what
        # has been written so far.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # every line must come back intact, followed by "" at the end
        for expected in (s1, s2, s3, ""):
            self.assertEqual(reader.readline(), expected)

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        # every line must come back intact, followed by "" at the end
        for expected in (s1, s2, s3, s4, s5, ""):
            self.assertEqual(reader.readline(), expected)
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" after a single BOM, in little- and big-endian byte order
    spamle = (b'\xff\xfe\x00\x00'
              + 2 * b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              + 2 * b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        info = codecs.lookup(self.encoding)
        # write the text in two steps through a stream writer
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        out.write("spam")
        out.write("spam")
        data = raw.getvalue()
        # the output must contain the BOM exactly once
        self.assertIn(data, (self.spamle, self.spambe))
        # and it must decode back to the text
        src = info.streamreader(io.BytesIO(data))
        self.assertEqual(src.read(), "spamspam")

    def test_badbom(self):
        # neither 4 nor 8 bytes of 0xff start with a valid BOM
        for count in (4, 8):
            stream = io.BytesIO(count * b"\xff")
            f = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            4 * [""]        # BOM bytes; byteorder known after the fourth
            + 3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_handlers(self):
        # a single truncated byte, handled by each error handler
        for errors, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for raw in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding, "spamspam", raw)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual(expected, codecs.utf_32_decode(encoded_be)[0])
383
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # no BOM: a character shows up with every fourth byte
        self.check_partial(
            "\x00\xff\u0100\uffff",
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
423
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # no BOM: a character shows up with every fourth byte
        self.check_partial(
            "\x00\xff\u0100\uffff",
            3 * [""]
            + 4 * ["\x00"]
            + 4 * ["\x00\xff"]
            + 4 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
463
464
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" after a single BOM, in little- and big-endian byte order
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # neither input starts with a valid BOM
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            s = io.BytesIO(raw)
            f = codecs.getreader(self.encoding)(s)
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a single truncated byte, handled by each error handler
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Open with plain 'r': the 'U' (universal newlines) mode flag was
        # removed in Python 3.11 and raises ValueError there.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000540
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # no BOM: a character shows up with every second byte
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_le_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        text = "\U00010203"
        raw = b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
579
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # no BOM: a character shows up with every second byte
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [""]
            + 2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 2 * ["\x00\xff\u0100"]
            + ["\x00\xff\u0100\uffff"]
        )

    def test_errors(self):
        # (malformed input, result of decoding it with errors='replace')
        cases = (
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        )
        for raw, expected in cases:
            with self.assertRaises(UnicodeDecodeError):
                codecs.utf_16_be_decode(raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # a non-BMP character round-trips through a surrogate pair
        text = "\U00010203"
        raw = b'\xd8\x00\xde\x03'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
618
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # a character shows up once the last byte of its sequence arrived
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            2 * ["\x00"]
            + 2 * ["\x00\xff"]
            + 3 * ["\x00\xff\u07ff"]
            + 3 * ["\x00\xff\u07ff\u0800"]
            + ["\x00\xff\u07ff\u0800\uffff"]
        )

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        raw = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, raw)

    def test_lone_surrogates(self):
        # lone surrogates are rejected in both directions by default
        with self.assertRaises(UnicodeEncodeError):
            "\ud800".encode("utf-8")
        with self.assertRaises(UnicodeDecodeError):
            b"\xed\xa0\x80".decode("utf-8")
        # result of encoding a lone surrogate with each error handler
        handled = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, expected in handled:
            self.assertEqual("[\uDC80]".encode("utf-8", errors), expected)

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets surrogates through in both directions
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in pairs:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # truncated or malformed surrogate sequences are still errors
        for broken in (b"abc\xed\xa0", b"abc\xed\xa0z"):
            with self.assertRaises(UnicodeDecodeError):
                broken.decode("utf-8", "surrogatepass")
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000673
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest):
    # Exercises the "cp65001" codec.  The expectations for lone surrogates
    # differ depending on the module-level VISTA_OR_LATER flag.
    encoding = "cp65001"

    def test_encode(self):
        # Each case is (text, error handler, expected bytes).  An expected
        # value of None means the call has to raise UnicodeEncodeError.
        cases = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if not VISTA_OR_LATER:
            cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        else:
            cases += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        for text, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                # Report which input/handler pair failed instead of
                # surfacing the bare codec error.
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        # Each case is (raw bytes, error handler, expected text).  An
        # expected value of None means UnicodeDecodeError must be raised.
        cases = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if not VISTA_OR_LATER:
            cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
        else:
            cases += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        for raw, errors, expected in cases:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        # With the default (strict) handler a lone surrogate is rejected
        # in both directions.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # Every other handler turns the surrogate into a fixed replacement.
        handler_results = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, expected in handler_results:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), expected)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        # "surrogatepass" must round-trip lone surrogates: check the encode
        # direction and then the decode direction for every pair.
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in pairs:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
            self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
772
773
774
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per step of the incremental decode
        # of "a+-b"; the helper inherited from ReadTest does the checking.
        text = "a+-b"
        expected_steps = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000789
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte with final=True cannot be decoded as UTF-16.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
797
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        # An array of signed bytes goes through the buffer interface and
        # comes back as plain bytes together with the consumed length.
        import array
        payload = array.array("b", b"spam")
        result = codecs.readbuffer_encode(payload)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
813
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        # One entry per step of the incremental decode performed by
        # check_partial().
        expected_steps = [
            "",
            "",
            "",  # First BOM has been read and skipped
            "",
            "",
            "\ufeff",  # Second BOM has been read and emitted
            "\ufeff\x00",  # "\x00" read and emitted
            "\ufeff\x00",  # First byte of encoded "\xff" read
            "\ufeff\x00\xff",  # Second byte of encoded "\xff" read
            "\ufeff\x00\xff",  # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",  # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected_steps)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # Encoding with utf-8-sig and feeding the result to an incremental
        # decoder must give back the original text.
        text = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream_decoding(self, bytestring, unistring):
        # Decode *bytestring* through a utf-8-sig StreamReader once per read
        # size and check that the concatenated chunks equal *unistring*.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            # A hint of None means "call read() without an argument".
            read_args = () if sizehint is None else (sizehint,)
            chunks = []
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                chunks.append(data)
            self.assertEqual("".join(chunks), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM.
        self._check_stream_decoding(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # The same payload without a leading BOM.
        self._check_stream_decoding(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )
893
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # escape_decode() produces bytes, so the expected value for empty
        # input has to be b"": a str "" never compares equal to bytes and
        # the previous expectation ("", 0) could not be satisfied.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000897
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000898class RecodingTest(unittest.TestCase):
899 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000900 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200901 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000902 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000903 f2.close()
904 # Python used to crash on this at exit because of a refcount
905 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000906
# From RFC 3492
# Each entry is a pair (text, punycode): the text as a str and its
# Punycode form as bytes.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--"),
]

# Import-time sanity check: print any entry that is not a 2-tuple (this
# catches a missing comma between the two strings of an entry).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001014
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii").lower()
            wanted = puny.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Decode a second time from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001033
class UnicodeInternalTest(unittest.TestCase):
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        little_endian = sys.byteorder == "little"
        deprecation = ('unicode_internal codec has been deprecated',
                       DeprecationWarning)
        # The byte strings below are written big-endian and are reversed
        # before use on little-endian machines.
        accepted = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in accepted:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings():
                self.assertEqual(expected, raw.decode("unicode_internal"))
        for raw in rejected:
            if little_endian:
                raw = raw[::-1]
            with support.check_warnings(deprecation):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")
        invalid = b"\x00\x00\x11\x00" if little_endian else b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises sits outside check_warnings so the decode error
        # propagates through the warnings context, as it would with a
        # plain try/except around the "with" statement.
        with self.assertRaises(UnicodeDecodeError) as caught:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                data.decode("unicode_internal")
        error = caught.exception
        self.assertEqual("unicode_internal", error.encoding)
        self.assertEqual(data, error.object)
        self.assertEqual(4, error.start)
        self.assertEqual(8, error.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register an "ignore" handler under a private name and check that
        # the four undecodable bytes in the middle are dropped.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decode = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decode(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encode = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encode(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001109
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is a pair (input, expected), both UTF-8 encoded bytes.
# An expected value of None marks input that nameprep must reject;
# (None, None) is a placeholder for a vector that is skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88'),
]
1262
1263
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        # Vectors are numbered from 1 so that failures can be reported
        # as "Test 3.<number>".
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = orig.decode("utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = prepped.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001282
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental APIs."""

    def test_builtin_decode(self):
        """str(bytes, "idna") handles plain and ACE labels, with or without a trailing dot."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") is the inverse of the cases in test_builtin_decode."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """Reading past the end of an idna stream reader returns an empty string."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Incremental decoding gives the one-shot result for byte-wise input."""

        def decode_bytewise(data):
            # Feed the decoder one byte per chunk.
            chunks = (bytes([c]) for c in data)
            return "".join(codecs.iterdecode(chunks, "idna"))

        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        # Non-ASCII name both without and with the trailing dot (the former
        # was previously missing: the trailing-dot case was checked twice).
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org"), "pyth\xf6n.org")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."), "pyth\xf6n.org.")

        decoder = codecs.getincrementaldecoder("idna")()
        # Nothing is returned until the label is terminated by a dot.
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        # The pending last label is only returned on the final call.
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Incremental encoding gives the one-shot result for char-wise input."""

        def encode_charwise(text):
            # Iterating a str hands iterencode() one character per chunk.
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(encode_charwise("python.org"), b"python.org")
        self.assertEqual(encode_charwise("python.org."), b"python.org.")
        # Non-ASCII name both without and with the trailing dot (the former
        # was previously missing: the trailing-dot case was checked twice).
        self.assertEqual(encode_charwise("pyth\xf6n.org"), b"xn--pythn-mua.org")
        self.assertEqual(encode_charwise("pyth\xf6n.org."), b"xn--pythn-mua.org.")

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        # The pending last label is only returned on the final call.
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001358
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of the codecs module."""

    def test_decode(self):
        """codecs.decode() decodes bytes and validates its arguments."""
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode() encodes str and validates its arguments."""
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        # No encoding argument: the default codec is used.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # The former literal '\xffff' was really '\xff' followed by "ff";
        # test the Latin-1 range and U+FFFF explicitly instead.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xff', 'ascii')
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\uffff', 'ascii')

    def test_register(self):
        """codecs.register() requires exactly one callable argument."""
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing argument and unknown names."""
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        """Codec lookup must keep working under a Turkish LC_CTYPE locale."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous locale whatever the outcome of the test.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1413
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # Two lines of UTF-8 text, each holding a single 3-byte character.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits on the newline and decodes each line."""
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001423
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        """Reads and writes are recoded between the data and file encodings."""
        # Reading: UTF-8 bytes in the file come back as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoder = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoder.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input is stored in the file as Latin-1.
        sink = io.BytesIO()
        recoder = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        recoder.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001435
# Names of the text encodings exercised by BasicUnicodeTest, grouped by
# family but kept in their original order.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when the codecs module exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Encodings skipped by the incremental coder checks.
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001553
Walter Dörwald3abcb012007-04-16 22:10:50 +00001554class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
# Generic checks that are run over every name in all_unicode_encodings.
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001555 def test_basics(self):
# Round-trips a short ASCII string through each codec using the stateless
# encoder/decoder, the stream writer/reader, the incremental coders (fetched
# both via codecs and via _testcapi) and iterencode()/iterdecode().
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001556 s = "abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001557 for encoding in all_unicode_encodings:
# The name reported by lookup() must match the list entry once "_" and "-"
# are treated as equal.
Thomas Woutersa9773292006-04-21 09:43:23 +00001558 name = codecs.lookup(encoding).name
1559 if encoding.endswith("_codec"):
1560 name += "_codec"
1561 elif encoding == "latin_1":
1562 name = "latin_1"
1563 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Victor Stinner040e16e2011-11-15 22:44:05 +01001564
Ezio Melottiadc417c2011-11-17 12:23:34 +02001565 with support.check_warnings():
Victor Stinner040e16e2011-11-15 22:44:05 +01001566 # unicode-internal has been deprecated
Victor Stinner040e16e2011-11-15 22:44:05 +01001567 (b, size) = codecs.getencoder(encoding)(s)
1568 self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
1569 (chars, size) = codecs.getdecoder(encoding)(b)
1570 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001571
1572 if encoding not in broken_unicode_with_streams:
1573 # check stream reader/writer
# Write one character at a time and collect what reaches the queue, then feed
# the collected bytes back one at a time through a reader.
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001574 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001575 writer = codecs.getwriter(encoding)(q)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001576 encodedresult = b""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001577 for c in s:
1578 writer.write(c)
Guido van Rossum98297ee2007-11-06 21:34:58 +00001579 chunk = q.read()
Benjamin Petersonc9c0f202009-06-30 23:06:06 +00001580 self.assertTrue(type(chunk) is bytes, type(chunk))
Guido van Rossum98297ee2007-11-06 21:34:58 +00001581 encodedresult += chunk
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001582 q = Queue(b"")
Victor Stinner05010702011-05-27 16:50:40 +02001583 reader = codecs.getreader(encoding)(q)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001584 decodedresult = ""
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001585 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001586 q.write(bytes([c]))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001587 decodedresult += reader.read()
1588 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1589
Thomas Wouters89f507f2006-12-13 04:49:30 +00001590 if encoding not in broken_incremental_coders:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001591 # check incremental decoder/encoder (fetched via the Python
1592 # and C API) and iterencode()/iterdecode()
Thomas Woutersa9773292006-04-21 09:43:23 +00001593 try:
1594 encoder = codecs.getincrementalencoder(encoding)()
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001595 cencoder = _testcapi.codec_incrementalencoder(encoding)
Thomas Woutersa9773292006-04-21 09:43:23 +00001596 except LookupError: # no IncrementalEncoder
1597 pass
1598 else:
1599 # check incremental decoder/encoder
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001600 encodedresult = b""
Thomas Woutersa9773292006-04-21 09:43:23 +00001601 for c in s:
1602 encodedresult += encoder.encode(c)
# The final call (second argument True) collects whatever the encoder still
# has to return.
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001603 encodedresult += encoder.encode("", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001604 decoder = codecs.getincrementaldecoder(encoding)()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001605 decodedresult = ""
Thomas Woutersa9773292006-04-21 09:43:23 +00001606 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001607 decodedresult += decoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001608 decodedresult += decoder.decode(b"", True)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001609 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1610
1611 # check C API
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001612 encodedresult = b""
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001613 for c in s:
1614 encodedresult += cencoder.encode(c)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001615 encodedresult += cencoder.encode("", True)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001616 cdecoder = _testcapi.codec_incrementaldecoder(encoding)
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001617 decodedresult = ""
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001618 for c in encodedresult:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001619 decodedresult += cdecoder.decode(bytes([c]))
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001620 decodedresult += cdecoder.decode(b"", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001621 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1622
1623 # check iterencode()/iterdecode()
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001624 result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
Thomas Woutersa9773292006-04-21 09:43:23 +00001625 self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))
1626
1627 # check iterencode()/iterdecode() with empty string
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001628 result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
1629 self.assertEqual(result, "")
Thomas Woutersa9773292006-04-21 09:43:23 +00001630
Victor Stinner554f3f02010-06-16 23:33:54 +00001631 if encoding not in ("idna", "mbcs"):
Thomas Wouters89f507f2006-12-13 04:49:30 +00001632 # check incremental decoder/encoder with errors argument
1633 try:
1634 encoder = codecs.getincrementalencoder(encoding)("ignore")
1635 cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
1636 except LookupError: # no IncrementalEncoder
1637 pass
1638 else:
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001639 encodedresult = b"".join(encoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001640 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001641 decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001642 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1643
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001644 encodedresult = b"".join(cencoder.encode(c) for c in s)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001645 cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001646 decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
Thomas Wouters89f507f2006-12-13 04:49:30 +00001647 self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1648
Walter Dörwald729c31f2005-03-14 19:06:30 +00001649 def test_seek(self):
# Reads the same encoded data five times through one stream reader, seeking
# back to the start before each read; "idna" and the encodings listed in
# broken_unicode_with_streams are skipped.
1650 # all codecs should be able to encode these
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001651 s = "%s\n%s\n" % (100*"abc123", 100*"def456")
Walter Dörwald729c31f2005-03-14 19:06:30 +00001652 for encoding in all_unicode_encodings:
1653 if encoding == "idna": # FIXME: See SF bug #1163178
1654 continue
1655 if encoding in broken_unicode_with_streams:
1656 continue
Victor Stinner05010702011-05-27 16:50:40 +02001657 reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
Guido van Rossum805365e2007-05-07 22:24:25 +00001658 for t in range(5):
Walter Dörwald729c31f2005-03-14 19:06:30 +00001659 # Test that calling seek resets the internal codec state and buffers
1660 reader.seek(0, 0)
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +00001661 data = reader.read()
1662 self.assertEqual(s, data)
Walter Dörwald729c31f2005-03-14 19:06:30 +00001663
Walter Dörwalde22d3392005-11-17 08:52:34 +00001664 def test_bad_decode_args(self):
# Every decoder must raise TypeError when called without arguments; all but
# "idna" and "punycode" must also raise it for an int argument.
1665 for encoding in all_unicode_encodings:
1666 decoder = codecs.getdecoder(encoding)
1667 self.assertRaises(TypeError, decoder)
1668 if encoding not in ("idna", "punycode"):
1669 self.assertRaises(TypeError, decoder, 42)
1670
1671 def test_bad_encode_args(self):
# Every encoder must raise TypeError when called without arguments.
1672 for encoding in all_unicode_encodings:
1673 encoder = codecs.getencoder(encoding)
Ezio Melottiadc417c2011-11-17 12:23:34 +02001674 with support.check_warnings():
1675 # unicode-internal has been deprecated
1676 self.assertRaises(TypeError, encoder)
Walter Dörwalde22d3392005-11-17 08:52:34 +00001677
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001678 def test_encoding_map_type_initialized(self):
1679 from encodings import cp1140
1680 # This used to crash, we are only verifying there's no crash.
1681 table_type = type(cp1140.encoding_table)
# Comparing the type with itself is deliberate: the comparison only has to
# complete.
1682 self.assertEqual(table_type, table_type)
1683
Walter Dörwald3abcb012007-04-16 22:10:50 +00001684 def test_decoder_state(self):
1685 # Check that getstate() and setstate() handle the state properly
# Encodings listed in broken_incremental_coders are skipped.
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001686 u = "abc123"
Walter Dörwald3abcb012007-04-16 22:10:50 +00001687 for encoding in all_unicode_encodings:
1688 if encoding not in broken_incremental_coders:
1689 self.check_state_handling_decode(encoding, u, u.encode(encoding))
1690 self.check_state_handling_encode(encoding, u, u.encode(encoding))
1691
class CharmapTest(unittest.TestCase):
    """Exercise codecs.charmap_decode() with str, int->str and int->int maps."""

    def test_decode_with_string_map(self):
        """The mapping is a str indexed by byte value."""
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        self.assertEqual(decode(raw, "strict", "abc"), ("abc", 3))
        self.assertEqual(decode(raw, "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # The map is too short for byte 0x02.
        self.assertRaises(UnicodeDecodeError, decode, raw, "strict", "ab")

        # A U+FFFE entry is expected to give the same result as a missing one.
        self.assertEqual(decode(raw, "replace", "ab"), ("ab\ufffd", 3))
        self.assertEqual(decode(raw, "replace", "ab\ufffe"), ("ab\ufffd", 3))
        self.assertEqual(decode(raw, "ignore", "ab"), ("ab", 3))
        self.assertEqual(decode(raw, "ignore", "ab\ufffe"), ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""), ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        """The mapping is a dict from byte value to replacement string."""
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"

        self.assertEqual(decode(raw, "strict", {0: 'a', 1: 'b', 2: 'c'}),
                         ("abc", 3))
        # Values may be longer than one character ...
        self.assertEqual(decode(raw, "strict", {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
                         ("AaBbCc", 3))
        self.assertEqual(decode(raw, "strict",
                                {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
                         ("\U0010FFFFbc", 3))
        # ... or empty.
        self.assertEqual(decode(raw, "strict", {0: 'a', 1: 'b', 2: ''}),
                         ("ab", 3))

        # No entry for byte 0x02.
        self.assertRaises(UnicodeDecodeError,
                          decode, raw, "strict", {0: 'a', 1: 'b'})

        # A None value is expected to give the same result as a missing key.
        self.assertEqual(decode(raw, "replace", {0: 'a', 1: 'b'}),
                         ("ab\ufffd", 3))
        self.assertEqual(decode(raw, "replace", {0: 'a', 1: 'b', 2: None}),
                         ("ab\ufffd", 3))
        self.assertEqual(decode(raw, "ignore", {0: 'a', 1: 'b'}),
                         ("ab", 3))
        self.assertEqual(decode(raw, "ignore", {0: 'a', 1: 'b', 2: None}),
                         ("ab", 3))

        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}), ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        """The mapping is a dict from byte value to code point."""
        decode = codecs.charmap_decode
        raw = b"\x00\x01\x02"
        a, b, c = ord('a'), ord('b'), ord('c')

        self.assertEqual(decode(raw, "strict", {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379
        self.assertEqual(decode(raw, "strict", {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))

        self.assertEqual(decode(raw, "strict",
                                {0: sys.maxunicode, 1: b, 2: c}),
                         (chr(sys.maxunicode) + "bc", 3))

        # A code point above sys.maxunicode is rejected.
        self.assertRaises(TypeError,
                          decode, raw, "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # No entry for byte 0x02.
        self.assertRaises(UnicodeDecodeError,
                          decode, raw, "strict", {0: a, 1: b})

        self.assertEqual(decode(raw, "replace", {0: a, 1: b}),
                         ("ab\ufffd", 3))
        self.assertEqual(decode(raw, "ignore", {0: a, 1: b}),
                         ("ab", 3))
1839
1840
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers must be usable as context managers."""

    def test_encodedfile(self):
        """EncodedFile works in a with statement and recodes what it reads."""
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter works in a with statement and decodes what it reads."""
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001853
class TypesTest(unittest.TestCase):
    """Which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        """Passing a str to a bytes decoder raises TypeError."""
        # Most decoders don't accept unicode input
        names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        decoders = [getattr(codecs, name) for name in names]
        # mbcs_decode is only tested when the codecs module provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        """The escape decoders accept str as well as bytes."""
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), expected)
            self.assertEqual(decode(br"\u1234"), expected)

        # \U00110000 is rejected in strict mode and replaced with "replace".
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"), ("\ufffd", 10))
1889
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        """Undecodable UTF-8 bytes become lone surrogates and back."""
        cases = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, text in cases:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), text)
            self.assertEqual(text.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        """A non-ASCII byte becomes a lone surrogate and back."""
        # bad byte
        raw, text = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), text)
        self.assertEqual(text.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        """A byte unmapped by a charmap codec becomes a lone surrogate and back."""
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, text = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), text)
        self.assertEqual(text.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        """Lone surrogates encode to their original bytes with latin-1."""
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1922
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001923
class BomTest(unittest.TestCase):
    """Byte order mark handling of files opened with codecs.open()."""

    def test_seek0(self):
        """Check when a BOM is (and is not) written around seek() calls."""
        data = "1234567890"
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                # Read the whole file twice, rewinding before each read.
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001979
Victor Stinner3fed0872010-05-22 02:16:27 +00001980
# Names of the bytes-to-bytes codecs that TransformCodecTest iterates over.
bytes_transform_encodings = [
    "base64_codec", "uu_codec",
    "quopri_codec", "hex_codec",
]

# The two compression codecs are only listed when the module they are
# named after can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1999
class TransformCodecTest(unittest.TestCase):
    # Checks for the codecs listed in bytes_transform_encodings.  Comments
    # are used instead of method docstrings so that unittest's verbose
    # output keeps showing the test method names.

    def test_basics(self):
        # Push every byte value through the generic encoder/decoder pair
        # and make sure the round trip gives the input back.
        payload = bytes(range(256))
        for name in bytes_transform_encodings:
            encoded, consumed = codecs.getencoder(name)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(name)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        # A stream reader wrapped around the encoded form must return the
        # original byte from read().
        for name in bytes_transform_encodings:
            stream = io.BytesIO(codecs.encode(b"\x80", name))
            result = codecs.getreader(name)(stream).read()
            self.assertEqual(result, b"\x80")

    def test_readline(self):
        # Same check as test_read, but through readline(); two codecs are
        # left out of this variant.
        skipped = ('uu_codec', 'zlib_codec')
        for name in bytes_transform_encodings:
            if name not in skipped:
                stream = io.BytesIO(codecs.encode(b"\x80", name))
                result = codecs.getreader(name)(stream).readline()
                self.assertEqual(result, b"\x80")
2027
2028
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    # Exercises codecs.code_page_encode() and codecs.code_page_decode().

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page is rejected with ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is expected to raise WindowsError.
        with self.assertRaises(WindowsError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The text of the raised error has to match the code page name.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Decode each case of *tests* with code page *cp*.

        *tests* is an iterable of (raw bytes, error handler, expected text)
        triples.  An expected value of None means that decoding has to
        raise UnicodeDecodeError.
        """
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)
                continue

            try:
                decoded = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, err))
            text = decoded[0]
            consumed = decoded[1]
            self.assertEqual(text, expected,
                             '%a.decode("cp%s", %r)=%a != %a'
                             % (raw, cp, errors, text, expected))
            # The consumed count must satisfy 0 <= consumed <= len(raw).
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Encode each case of *tests* with code page *cp*.

        *tests* is an iterable of (text, error handler, expected bytes)
        triples.  An expected value of None means that encoding has to
        raise UnicodeEncodeError.
        """
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)
                continue

            try:
                encoded = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, err))
            data = encoded[0]
            consumed = encoded[1]
            self.assertEqual(data, expected,
                             '%a.encode("cp%s", %r)=%a != %a'
                             % (text, cp, errors, data, expected))
            self.assertEqual(consumed, len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        cp = 65000

        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)

        utf8_decode_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_decode_cases)

        # The encoding half only runs when VISTA_OR_LATER is true.
        if not VISTA_OR_LATER:
            return
        utf8_encode_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, utf8_encode_cases)

    def test_incremental(self):
        # Passing False as the last argument asks for a non-final decode:
        # the expected results show a trailing lone lead byte left
        # unconsumed rather than reported as an error.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2176
2177
def test_main():
    """Hand the test case classes listed below to support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    support.run_unittest(*test_classes)


# Run the suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()