blob: 4e808ec6acd0288d4cf8dded53b815906bba1c94 [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only on Windows with a major version >= 6 (Vista or later).
# sys.getwindowsversion() exists only on Windows, so the platform check
# must short-circuit before it is called.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
15
# ctypes is optional: when it cannot be imported, ``ctypes`` is set to None
# and SIZEOF_WCHAR_T to the sentinel -1 so that callers can test for it.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        # *buffer* is the initial content; its type (e.g. bytes) determines
        # the type of everything later returned by read().
        self._buffer = buffer

    def write(self, chars):
        """Append *chars* to the end of the queue."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return data from the front of the queue.

        A negative *size* (the default) drains the whole queue; otherwise
        at most *size* items are removed and returned.
        """
        if size < 0:
            s = self._buffer
            self._buffer = self._buffer[:0]  # make empty, keeping the type
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
43
class MixInCheckStateHandling:
    """Mixin with helpers that check getstate()/setstate() of incremental
    codecs.  It relies on the assert* methods of unittest.TestCase, so it
    must be combined with a TestCase subclass.
    """

    def check_state_handling_decode(self, encoding, u, s):
        # Split the encoded input *s* at every possible position, decode the
        # first part, transfer the decoder state to a fresh decoder and make
        # sure that decoding the rest there yields the full text *u*.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea as above for encoders: split the text *u* at every
        # position, move the state to a fresh encoder and check that the
        # two encoded parts concatenate to *s*.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
76
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader tests shared by the per-codec test classes.

    Every method reads ``self.encoding``; subclasses are expected to define
    that class attribute with the name of the codec under test.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line and return the lines joined by "|"
            # so that the line boundaries are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be listed explicitly: all of these characters
        # are whitespace, so "\n \r\n \r \u2028".split() returns an empty
        # list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # +200 so that even the first line (i == 0) is a long one
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|"
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the original
        # lines, including their "\r\n" line ends.
        # NOTE(review): runs of spaces inside these literals look like they
        # were collapsed by the blame-view export this file was taken from;
        # confirm against the upstream file.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes available to
        # the reader piece by piece, exactly as it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        # at EOF readline() returns the empty string
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        # at EOF readline() returns the empty string
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest):
    """Tests for the BOM-detecting "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian
    # byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit with final=True goes through the error
        # handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
383
class UTF32LETest(ReadTest):
    """Tests for the little-endian "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One character becomes available for every four bytes fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
423
class UTF32BETest(ReadTest):
    """Tests for the big-endian "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One character becomes available for every four bytes fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
463
464
class UTF16Test(ReadTest):
    """Tests for the BOM-detecting "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, in little- and big-endian
    # byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must be rejected.
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit with final=True goes through the error
        # handler.
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' instead of the deprecated 'U' mode flag: codecs.open()
        # opens the file in binary mode either way when an encoding is given.
        with codecs.open(support.TESTFN, 'r', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000540
class UTF16LETest(ReadTest):
    """Tests for the little-endian "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One character becomes available for every two bytes fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (invalid input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
579
class UTF16BETest(ReadTest):
    """Tests for the big-endian "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One character becomes available for every two bytes fed.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        # (invalid input, expected result with the 'replace' handler)
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
618
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # The input mixes 1-, 2- and 3-byte sequences; a character only
        # shows up once its last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode in both directions;
        # the other error handlers produce their usual replacements.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets lone surrogates through in both directions.
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000669
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest):
    """Tests for the Windows-only "cp65001" codec.

    Several expectations differ depending on VISTA_OR_LATER.
    """
    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means that encoding must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                    '%a.encode("cp65001", %r)=%a != %a'
                    % (text, errors, encoded, expected))
            else:
                self.assertRaises(UnicodeEncodeError,
                    text.encode, "cp65001", errors)

    def test_decode(self):
        # Each entry is (bytes, error handler, expected text); an expected
        # value of None means that decoding must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                    '%a.decode("cp65001", %r)=%a != %a'
                    % (raw, errors, decoded, expected))
            else:
                self.assertRaises(UnicodeDecodeError,
                    raw.decode, 'cp65001', errors)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
768
769
770
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One expected result per step of the check_partial() helper
        # inherited from ReadTest.
        text = "a+-b"
        expected_steps = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000785
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte passed with final=True must be rejected.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the mandatory data argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
793
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # An array of signed bytes is accepted as a buffer.
        source = array.array("b", b"spam")
        result = codecs.readbuffer_encode(source)
        self.assertEqual(result, (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        for args in ((), (42,)):
            self.assertRaises(TypeError, codecs.readbuffer_encode, *args)
809
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected_steps = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected_steps)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        s = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        encoded = s.encode("utf-8-sig")
        self.assertEqual(decoder.decode(encoded), s)

    def _check_stream_read(self, bytestring, unistring):
        # Shared driver for the stream tests: decode bytestring through a
        # "utf-8-sig" stream reader using several read sizes (None means a
        # plain read() call) and require the joined chunks to equal
        # unistring each time.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None]
        sizehints.extend(range(1, 11))
        sizehints.extend([64, 128, 256, 512, 1024])
        for sizehint in sizehints:
            istream = make_reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Same payload without a leading BOM.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
889
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # escape_decode() produces bytes, so the expected value must be a
        # bytes literal; a str "" never compares equal to b"".
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000893
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000902
# Sample strings from RFC 3492.
# Each entry is a 2-tuple: (text as str, its Punycode form as bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]
1006
# Import-time sanity check: print any table entry that is not a pair
# (for example because of a missing comma between adjacent literals).
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001010
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively on both sides: some reference
            # encodings contain upper case while our encoder emits only
            # lower case, and lowering just the reference is not enough
            # because some input characters are upper case themselves.
            produced = str(uni.encode("punycode"), "ascii")
            reference = str(puny, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Repeat with a bytes object rebuilt through an ASCII round trip.
            rebuilt = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001029
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec.  Most calls are wrapped in
    # support.check_warnings(); the filter tuples name the codec's
    # DeprecationWarning.
    # NOTE(review): check_warnings is defined in test.support, outside this
    # file -- presumably a filter tuple requires that warning to be raised
    # inside the block; confirm before moving statements in or out of it.

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # The byte strings below are reversed before use when
        # sys.byteorder is "little".
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # 0x110000 is one past the 0x10ffff limit mentioned above; it is
        # spelled out per byte order here rather than reversed.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must carry the codec name, the full
        # input and the offsets 4..8 of the second 4-byte unit.
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            # Decoding succeeded, which is itself a failure.
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register codecs.ignore_errors under a private handler name and
        # decode with it.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            # Splice four 0x22 bytes between the two encoded characters.
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        # Only "ab" is returned, yet all 12 input bytes count as consumed.
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            # Element [1] of the encoder result is the consumed input length.
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001105
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both UTF-8 encoded bytes.
# An expected output of None means nameprep must reject the input;
# (None, None) keeps the slot of a skipped vector so that list positions
# still match the 3.x numbering used in the comments.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1258
1259
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        # Number the vectors from 1 so failures report "Test 3.<n>".
        for number, (raw, raw_expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = raw.decode("utf-8", "surrogatepass")
            if raw_expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = raw_expected.decode("utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001278
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001279class IDNACodecTest(unittest.TestCase):
1280 def test_builtin_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001281 self.assertEqual(str(b"python.org", "idna"), "python.org")
1282 self.assertEqual(str(b"python.org.", "idna"), "python.org.")
1283 self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
1284 self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001285
1286 def test_builtin_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001287 self.assertEqual("python.org".encode("idna"), b"python.org")
1288 self.assertEqual("python.org.".encode("idna"), b"python.org.")
1289 self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
1290 self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001291
Martin v. Löwis8b595142005-08-25 11:03:38 +00001292 def test_stream(self):
Victor Stinner05010702011-05-27 16:50:40 +02001293 r = codecs.getreader("idna")(io.BytesIO(b"abc"))
Martin v. Löwis8b595142005-08-25 11:03:38 +00001294 r.read(3)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001295 self.assertEqual(r.read(), "")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001296
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001297 def test_incremental_decode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001298 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001299 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org"), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001300 "python.org"
1301 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001302 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001303 "".join(codecs.iterdecode((bytes([c]) for c in b"python.org."), "idna")),
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001304 "python.org."
1305 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001306 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001307 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001308 "pyth\xf6n.org."
1309 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001310 self.assertEqual(
Guido van Rossum09549f42007-08-27 20:40:10 +00001311 "".join(codecs.iterdecode((bytes([c]) for c in b"xn--pythn-mua.org."), "idna")),
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001312 "pyth\xf6n.org."
1313 )
1314
1315 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001316 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1317 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1318 self.assertEqual(decoder.decode(b"rg"), "")
1319 self.assertEqual(decoder.decode(b"", True), "org")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001320
1321 decoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001322 self.assertEqual(decoder.decode(b"xn--xam", ), "")
1323 self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
1324 self.assertEqual(decoder.decode(b"rg."), "org.")
1325 self.assertEqual(decoder.decode(b"", True), "")
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001326
1327 def test_incremental_encode(self):
Ezio Melottib3aedd42010-11-20 19:04:17 +00001328 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001329 b"".join(codecs.iterencode("python.org", "idna")),
1330 b"python.org"
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001331 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001332 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001333 b"".join(codecs.iterencode("python.org.", "idna")),
1334 b"python.org."
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001335 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001336 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001337 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1338 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001339 )
Ezio Melottib3aedd42010-11-20 19:04:17 +00001340 self.assertEqual(
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001341 b"".join(codecs.iterencode("pyth\xf6n.org.", "idna")),
1342 b"xn--pythn-mua.org."
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001343 )
1344
1345 encoder = codecs.getincrementalencoder("idna")()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001346 self.assertEqual(encoder.encode("\xe4x"), b"")
1347 self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
1348 self.assertEqual(encoder.encode("", True), b"org")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001349
1350 encoder.reset()
Ezio Melottib3aedd42010-11-20 19:04:17 +00001351 self.assertEqual(encoder.encode("\xe4x"), b"")
1352 self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
1353 self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001354
class CodecsModuleTest(unittest.TestCase):
    """Argument and error handling of the module-level codecs functions."""

    def test_decode(self):
        # Explicit encoding, missing argument, default encoding and
        # undecodable input.
        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')

    def test_encode(self):
        # Explicit encoding, missing argument, unknown codec, default
        # encoding and unencodable input.
        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
                         b'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        # NOTE: '\xffff' is U+00FF followed by the two letters "ff"; the
        # first character is what makes the ASCII encoder fail.
        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')

    def test_register(self):
        # register() needs exactly one callable argument.
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        # Register the cleanup before switching so that the previous
        # LC_CTYPE setting is restored however the test ends.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1409
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two three-byte UTF-8 sequences separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        # readlines() keeps the line ending and returns the unterminated
        # last line as well.
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001419
class EncodedFileTest(unittest.TestCase):
    """Transcoding on read and on write through codecs.EncodedFile."""

    def test_basic(self):
        # Reading: the underlying file holds UTF-8, the caller sees
        # UTF-16-LE.
        f = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the underlying file
        # receives Latin-1.
        f = io.BytesIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin-1')
        ef.write(b'\xc3\xbc')
        self.assertEqual(f.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001431
# Text (str <-> bytes) codecs exercised by BasicUnicodeTest.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# The mbcs codec is only tested when this build of codecs provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "punycode",
    "unicode_internal"
]
# Encodings whose incremental coders are skipped as well.
broken_incremental_coders = broken_unicode_with_streams + [
    "idna",
]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001549
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every codec in all_unicode_encodings."""

    @staticmethod
    def _incremental_roundtrip(encoder, decoder, text, flush):
        # Encode *text* one character at a time with *encoder*, then decode
        # the result one byte at a time with *decoder* and return the
        # decoded string.  When *flush* is true both coders additionally
        # get an empty "final" call at the end.
        encoded = b"".join(encoder.encode(char) for char in text)
        if flush:
            encoded += encoder.encode("", True)
        decoded = "".join(decoder.decode(bytes([byte])) for byte in encoded)
        if flush:
            decoded += decoder.decode(b"", True)
        return decoded

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The canonical codec name must match the list entry, modulo
            # "_" versus "-".
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s),
                                 "%r != %r (encoding=%r)"
                                 % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s,
                                 "%r != %r (encoding=%r)"
                                 % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertIs(type(chunk), bytes)
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s,
                                 "%r != %r (encoding=%r)"
                                 % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    decodedresult = self._incremental_roundtrip(
                        encoder,
                        codecs.getincrementaldecoder(encoding)(),
                        s, True)
                    self.assertEqual(decodedresult, s,
                                     "%r != %r (encoding=%r)"
                                     % (decodedresult, s, encoding))

                    # check C API
                    decodedresult = self._incremental_roundtrip(
                        cencoder,
                        _testcapi.codec_incrementaldecoder(encoding),
                        s, True)
                    self.assertEqual(decodedresult, s,
                                     "%r != %r (encoding=%r)"
                                     % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s,
                                     "%r != %r (encoding=%r)"
                                     % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                        codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(
                            encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        # No final flush in this variant.
                        decodedresult = self._incremental_roundtrip(
                            encoder,
                            codecs.getincrementaldecoder(encoding)("ignore"),
                            s, False)
                        self.assertEqual(decodedresult, s,
                                         "%r != %r (encoding=%r)"
                                         % (decodedresult, s, encoding))

                        decodedresult = self._incremental_roundtrip(
                            cencoder,
                            _testcapi.codec_incrementaldecoder(
                                encoding, "ignore"),
                            s, False)
                        self.assertEqual(decodedresult, s,
                                         "%r != %r (encoding=%r)"
                                         % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Decoders must reject a missing argument; all but idna and
        # punycode are also checked with an int argument.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1687
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() with the supported kinds of mapping."""

    def test_decode_with_string_map(self):
        # The mapping is a string indexed by byte value.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Byte 2 lies beyond the end of the mapping string.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        # A U+FFFE entry is handled like a missing one.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # The mapping is a dict from byte value to replacement string.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        # One byte may map to several characters ...
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        # ... or to none at all.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        # An explicit None entry is handled like a missing key.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        # The mapping is a dict from byte value to code point.
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # A code point above sys.maxunicode is rejected with TypeError.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )
1835
1836
class WithStmtTest(unittest.TestCase):
    """The codecs file wrappers can be used as context managers."""

    def test_encodedfile(self):
        # The file holds UTF-8; reading through the wrapper gives Latin-1.
        f = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")

    def test_streamreaderwriter(self):
        f = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001849
class TypesTest(unittest.TestCase):
    """Which input types the low-level decoder functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs is only checked when this build of codecs provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))

        # An escape above U+10FFFF is an error in strict mode and becomes
        # U+FFFD with "replace"; all ten input characters count as consumed.
        self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))

        self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
1885
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        # Bad byte
        self.assertEqual(b"foo\x80bar".decode("utf-8", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("utf-8", "surrogateescape"),
                         b"foo\x80bar")
        # bad-utf-8 encoded surrogate
        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "surrogateescape"),
                         "\udced\udcb0\udc80")
        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "surrogateescape"),
                         b"\xed\xb0\x80")

    def test_ascii(self):
        # bad byte
        self.assertEqual(b"foo\x80bar".decode("ascii", "surrogateescape"),
                         "foo\udc80bar")
        self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"),
                         b"foo\x80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"),
                         "foo\udca5bar")
        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "surrogateescape"),
                         b"foo\xa5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual("\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1918
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001919
class BomTest(unittest.TestCase):
    """Interaction between seek() and BOM writing in codecs.open() files."""

    def test_seek0(self):
        data = "1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        # Every sub-check reuses the same scratch file; remove it at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001975
Victor Stinner3fed0872010-05-22 02:16:27 +00001976
# Names of the bytes-to-bytes transform codecs that TransformCodecTest
# iterates over.
bytes_transform_encodings = [
    "base64_codec", "uu_codec",
    "quopri_codec", "hex_codec",
]

# The two compression codecs are only listed when the module backing them
# can actually be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1995
class TransformCodecTest(unittest.TestCase):
    """Round-trip checks for the codecs in bytes_transform_encodings.

    Every test loops over the module-level bytes_transform_encodings list,
    so a codec is only exercised when it was added to that list.
    """

    def test_basics(self):
        # Generic codecs interface: encode all 256 byte values, decode the
        # result again, and check both the data and the consumed lengths.
        payload = bytes(range(256))
        for name in bytes_transform_encodings:
            encoded, consumed = codecs.getencoder(name)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(name)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        # A StreamReader wrapped around the encoded form of one byte must
        # give that byte back from read().
        for name in bytes_transform_encodings:
            stream = io.BytesIO(codecs.encode(b"\x80", name))
            self.assertEqual(codecs.getreader(name)(stream).read(), b"\x80")

    def test_readline(self):
        # Same as test_read, but through readline().
        # NOTE(review): uu_codec and zlib_codec are excluded from this
        # check; the reason is not recorded here.
        excluded = ['uu_codec', 'zlib_codec']
        for name in bytes_transform_encodings:
            if name not in excluded:
                stream = io.BytesIO(codecs.encode(b"\x80", name))
                line = codecs.getreader(name)(stream).readline()
                self.assertEqual(line, b"\x80")
2023
2024
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode().

    The whole class is skipped unless sys.platform is 'win32'.
    """

    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        # A negative code page number is rejected with ValueError ...
        with self.assertRaises(ValueError):
            codecs.code_page_encode(-1, 'a')
        with self.assertRaises(ValueError):
            codecs.code_page_decode(-1, b'a')
        # ... while code page 123 is expected to fail with WindowsError.
        with self.assertRaises(WindowsError):
            codecs.code_page_encode(123, 'a')
        with self.assertRaises(WindowsError):
            codecs.code_page_decode(123, b'a')

    def test_code_page_name(self):
        # The error text must mention the code page by name: "cp932" for
        # code page 932 and "CP_UTF8" for self.CP_UTF8.
        with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
            codecs.code_page_encode(932, '\xff')
        with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
            codecs.code_page_decode(932, b'\x81\x00')
        with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
            codecs.code_page_decode(self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run a table of decoding cases against code page *cp*.

        Each entry of *tests* is a (raw bytes, error handler, expected text)
        triple.  An expected value of None means that decoding has to raise
        UnicodeDecodeError; otherwise the decoded text must equal the
        expected one and the reported length must lie within
        0..len(raw).
        """
        for raw, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.code_page_decode(cp, raw, errors)
                continue
            try:
                result = codecs.code_page_decode(cp, raw, errors)
            except UnicodeDecodeError as exc:
                self.fail('Unable to decode %a from "cp%s" with '
                          'errors=%r: %s' % (raw, cp, errors, exc))
            text = result[0]
            mismatch = ('%a.decode("cp%s", %r)=%a != %a'
                        % (raw, cp, errors, text, expected))
            self.assertEqual(text, expected, mismatch)
            # assert 0 <= consumed <= len(raw)
            consumed = result[1]
            self.assertGreaterEqual(consumed, 0)
            self.assertLessEqual(consumed, len(raw))

    def check_encode(self, cp, tests):
        """Run a table of encoding cases against code page *cp*.

        Each entry of *tests* is a (text, error handler, expected bytes)
        triple.  An expected value of None means that encoding has to raise
        UnicodeEncodeError; otherwise the encoded bytes must equal the
        expected ones and the reported length must equal len(text).
        """
        for text, errors, expected in tests:
            if expected is None:
                with self.assertRaises(UnicodeEncodeError):
                    codecs.code_page_encode(cp, text, errors)
                continue
            try:
                result = codecs.code_page_encode(cp, text, errors)
            except UnicodeEncodeError as exc:
                self.fail('Unable to encode %a to "cp%s" with '
                          'errors=%r: %s' % (text, cp, errors, exc))
            data = result[0]
            mismatch = ('%a.encode("cp%s", %r)=%a != %a'
                        % (text, cp, errors, data, expected))
            self.assertEqual(data, expected, mismatch)
            self.assertEqual(result[1], len(text))

    def test_cp932(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
        )
        self.check_encode(932, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
        )
        self.check_decode(932, decode_cases)

    def test_cp1252(self):
        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        )
        self.check_encode(1252, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        )
        self.check_decode(1252, decode_cases)

    def test_cp_utf7(self):
        # Code page 65000 is the one this test treats as UTF-7.
        cp = 65000

        encode_cases = (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        )
        self.check_encode(cp, encode_cases)

        decode_cases = (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        )
        self.check_decode(cp, decode_cases)

    def test_multibyte_encoding(self):
        # Input that starts with a byte which cannot be decoded, followed
        # by a valid multibyte sequence: only the bad byte may be dropped
        # or replaced.
        cp932_cases = (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        )
        self.check_decode(932, cp932_cases)

        utf8_cases = (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        )
        self.check_decode(self.CP_UTF8, utf8_cases)

        # The encoding half is only run when VISTA_OR_LATER is true.
        if not VISTA_OR_LATER:
            return
        surrogate_cases = (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        )
        self.check_encode(self.CP_UTF8, surrogate_cases)

    def test_incremental(self):
        # With the fourth argument set to False, a trailing lone lead byte
        # is expected to be left unconsumed instead of raising an error:
        # each case pairs the input with the expected (text, length) result.
        cases = (
            (b'\x82', ('', 0)),
            (b'\xe9\x80\xe9', ('\u9a3e', 2)),
            (b'\xe9\x80\xe9\x80', ('\u9a3e\u9a3e', 4)),
            (b'abc', ('abc', 3)),
        )
        for raw, expected in cases:
            decoded = codecs.code_page_decode(932, raw, 'strict', False)
            self.assertEqual(decoded, expected)
2172
2173
def test_main():
    """Run every test class of this module through support.run_unittest()."""
    test_classes = (
        UTF32Test, UTF32LETest, UTF32BETest,
        UTF16Test, UTF16LETest, UTF16BETest,
        UTF8Test, UTF8SigTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    # The classes are handed over in the order listed above.
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002205
2206
# Run the full suite via test_main() when this file is executed as a script.
if __name__ == "__main__":
    test_main()