blob: 5daaa19fdeaa6f0ab38cb8a088dad8a72bb4608c [file] [log] [blame]
Victor Stinner040e16e2011-11-15 22:44:05 +01001import _testcapi
Victor Stinner05010702011-05-27 16:50:40 +02002import codecs
Victor Stinner040e16e2011-11-15 22:44:05 +01003import io
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner040e16e2011-11-15 22:44:05 +01005import sys
6import unittest
7import warnings
8
9from test import support
Victor Stinner182d90d2011-09-29 19:53:55 +020010
# True only when running on Windows with a major version of 6 or higher
# (Vista or later); the cp65001 tests pick their expectations from it.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)

# ctypes is optional: when it cannot be imported the name is bound to None
# and the size of wchar_t is reported as -1.
try:
    import ctypes
except ImportError:
    ctypes = None
SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000023
class Queue(object):
    """
    FIFO buffer: data is appended at one end and read from the other end
    """
    def __init__(self, buffer):
        # The initial value also decides the sequence type that is stored
        # (the tests in this file pass b"").
        self._buffer = buffer

    def write(self, chars):
        """Append chars to the end of the buffered data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return buffered data.

        A negative size drains the whole buffer; otherwise at most size
        items are taken from the front.
        """
        if size >= 0:
            head = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return head
        drained = self._buffer
        self._buffer = drained[:0]  # empty sequence of the same type
        return drained
43
class MixInCheckStateHandling:
    """Mixin with helpers that check getstate()/setstate() of incremental
    codecs.

    The helpers call self.assert* methods, so the mixin has to be combined
    with unittest.TestCase.
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split s at every offset, move the decoder state into a fresh
        decoder and check that the two decoded pieces add up to u."""
        for split in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            head = decoder.decode(s[:split])
            saved = decoder.getstate()
            self.assertIsInstance(saved[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not saved[1]:
                # put the decoder into the default state with an empty buffer
                decoder.setstate((saved[0][:0], 0))
                # re-feeding the buffered input must not yield any output ...
                self.assertFalse(decoder.decode(saved[0]))
                # ... and must bring the decoder back to the saved state
                self.assertEqual(saved, decoder.getstate())
            # Continue with a brand-new decoder that only knows the state
            # extracted from the first one
            resumed = codecs.getincrementaldecoder(encoding)()
            resumed.setstate(saved)
            tail = resumed.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split u at every offset, move the encoder state into a fresh
        encoder and check that the two encoded pieces add up to s."""
        for split in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            head = encoder.encode(u[:split])
            saved = encoder.getstate()
            resumed = codecs.getincrementalencoder(encoding)()
            resumed.setstate(saved)
            tail = resumed.encode(u[split:], True)
            self.assertEqual(s, head + tail)
76
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """StreamReader and incremental-decoder tests shared by the codec tests.

    Concrete subclasses set the ``encoding`` class attribute; every method
    here reads the codec name through ``self.encoding``.
    """

    def check_partial(self, input, partialresults):
        """Decode input byte by byte and compare every intermediate result.

        input is encoded with self.encoding and fed one byte at a time.
        After each byte the text decoded so far must equal the matching
        entry of partialresults.  The check runs with a StreamReader, with
        an incremental decoder, with the same decoder after reset(), and
        finally the whole input is round-tripped through codecs.iterdecode().
        """
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded],
                                      self.encoding))
        )

    def test_readline(self):
        """readline() must split on \\n, \\r\\n, \\r and \\u2028, with and
        without keepends and with or without a size hint."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read lines until readline() returns an empty string and
            # return them joined with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends have to be listed explicitly: each of them is
        # whitespace, so "\n \r\n \r \u2028".split() returns an empty list
        # and loops over it never execute.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # Every line is non-empty (an empty line would end
            # readalllines() early when keepends is false) and uses a real
            # non-ASCII character ("\3042" is the octal escape \304 + "2").
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins the lines with "|", so must the expectation
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield the \\r\\n-terminated lines
        unchanged."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a stream that is filled incrementally, where a
        trailing \\r may or may not be followed by \\n later."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() must return three consecutive \\r\\n lines intact and
        then an empty string."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Same as test_bug1098990_a with five shorter lines."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000296
class UTF32Test(ReadTest):
    """Tests for the "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by one BOM, little-endian and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must produce a single BOM."""
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for _ in range(2):
            out.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        back = info.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        """Reading input that starts with an invalid BOM raises
        UnicodeError."""
        for count in (4, 8):
            stream = io.BytesIO(count*b"\xff")
            reader = codecs.getreader(self.encoding)(stream)
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""] * 4          # BOM bytes; byteorder known after the fourth
            + [""] * 3        # first three bytes of the first character
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        """'replace' and 'ignore' both consume the single truncated byte."""
        for errors, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_32_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        samples = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),   # little-endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),   # big-endian
        )
        for bom, unit in samples:
            encoded = bom + unit * 1024
            self.assertEqual('\U00010000' * 1024,
                             codecs.utf_32_decode(encoded)[0])
383
class UTF32LETest(ReadTest):
    """Tests for the "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # a character becomes available once its fourth byte has been read
        expected = (
            [""] * 3
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
423
class UTF32BETest(ReadTest):
    """Tests for the "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # a character becomes available once its fourth byte has been read
        expected = (
            [""] * 3
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(encoded)[0]
        self.assertEqual('\U00010000' * 1024, decoded)
463
464
class UTF16Test(ReadTest):
    """Tests for the "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by one BOM, little-endian and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """Two writes through one StreamWriter must produce a single BOM."""
        info = codecs.lookup(self.encoding)
        # encode some stream
        raw = io.BytesIO()
        out = info.streamwriter(raw)
        for _ in range(2):
            out.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(data, (self.spamle, self.spambe))
        # try to read it back
        back = info.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        """Reading input that starts with an invalid BOM raises
        UnicodeError."""
        for raw in (b"\xff\xff", b"\xff\xff\xff\xff"):
            reader = codecs.getreader(self.encoding)(io.BytesIO(raw))
            self.assertRaises(UnicodeError, reader.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        expected = (
            [""] * 2          # BOM bytes; byteorder known after the second
            + [""]            # first byte of the first character
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text]
        )
        self.check_partial(text, expected)

    def test_handlers(self):
        """'replace' and 'ignore' both consume the single truncated byte."""
        for errors, decoded in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((decoded, 1),
                             codecs.utf_16_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        text = 'Hello\r\nworld\r\n'

        payload = text.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(payload)
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            content = reader.read()
            self.assertEqual(content, text)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000540
class UTF16LETest(ReadTest):
    """Tests for the "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # a character becomes available once its second byte has been read
        expected = (
            [""]
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\x00\xd8\x03\xde'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
568
class UTF16BETest(ReadTest):
    """Tests for the "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        # a character becomes available once its second byte has been read
        expected = (
            [""]
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_be_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        char = "\U00010203"
        pair = b'\xd8\x00\xde\x03'
        self.assertEqual(char.encode(self.encoding), pair)
        self.assertEqual(pair.decode(self.encoding), char)
596
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff"
        # one entry per encoded byte: the characters take 1, 2, 2, 3 and 3
        # bytes, and each appears once its last byte has been read
        expected = (
            [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 3
            + [text[:4]] * 3
            + [text]
        )
        self.check_partial(text, expected)

    def test_decoder_state(self):
        text = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        encoded = text.encode(self.encoding)
        self.check_state_handling_decode(self.encoding, text, encoded)

    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and handled by the
        error handlers when encoding."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        handlers = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, encoded in handlers:
            self.assertEqual("[\uDC80]".encode("utf-8", errors), encoded)

    def test_surrogatepass_handler(self):
        """'surrogatepass' lets lone surrogates through in both
        directions."""
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in pairs:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000647
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest):
    """Tests for the Windows-only "cp65001" codec.

    The expectations for lone surrogates depend on VISTA_OR_LATER.
    """
    encoding = "cp65001"

    def test_encode(self):
        """Encode (text, errors) pairs; an expected value of None means the
        call must raise UnicodeEncodeError."""
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        if VISTA_OR_LATER:
            tests += [
                ('\udc80', 'strict', None),
                ('\udc80', 'ignore', b''),
                ('\udc80', 'replace', b'?'),
                ('\udc80', 'backslashreplace', b'\\udc80'),
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ]
        else:
            tests += [('\udc80', 'strict', b'\xed\xb2\x80')]
        for text, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)
                continue
            try:
                encoded = text.encode('cp65001', errors)
            except UnicodeEncodeError as err:
                self.fail('Unable to encode %a to cp65001 with '
                          'errors=%r: %s' % (text, errors, err))
            self.assertEqual(encoded, expected,
                             '%a.encode("cp65001", %r)=%a != %a'
                             % (text, errors, encoded, expected))

    def test_decode(self):
        """Decode (raw, errors) pairs; an expected value of None means the
        call must raise UnicodeDecodeError."""
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
        ]
        if VISTA_OR_LATER:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ]
        else:
            tests += [
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ]
        for raw, errors, expected in tests:
            if expected is None:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)
                continue
            try:
                decoded = raw.decode('cp65001', errors)
            except UnicodeDecodeError as err:
                self.fail('Unable to decode %a from cp65001 with '
                          'errors=%r: %s' % (raw, errors, err))
            self.assertEqual(decoded, expected,
                             '%a.decode("cp65001", %r)=%a != %a'
                             % (raw, errors, decoded, expected))

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        """Lone surrogates are rejected in strict mode and handled by the
        error handlers when encoding."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        handlers = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for errors, encoded in handlers:
            self.assertEqual("[\uDC80]".encode("cp65001", errors), encoded)

    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        """'surrogatepass' lets lone surrogates through in both
        directions."""
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, encoded in pairs:
            self.assertEqual(text.encode("cp65001", "surrogatepass"), encoded)
            self.assertEqual(encoded.decode("cp65001", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
746
747
748
class UTF7Test(ReadTest):
    """ReadTest specialisation for the "utf-7" codec."""

    encoding = "utf-7"

    def test_partial(self):
        # Expected results handed to check_partial() for the text below
        # (presumably one entry per unit of encoded input fed so far --
        # see ReadTest.check_partial to confirm).
        text = "a+-b"
        expected_steps = [
            "a",
            "a",
            "a+",
            "a+-",
            "a+-b",
        ]
        self.check_partial(text, expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000763
class UTF16ExTest(unittest.TestCase):
    """Error-path checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A single stray byte cannot be decoded in strict mode.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
771
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # An array of signed bytes is accepted and returned as bytes
        # together with its length.
        source = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(source), (b"spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is acceptable.
        for bad_args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*bad_args)
787
class UTF8SigTest(ReadTest):
    """ReadTest specialisation for the "utf-8-sig" codec."""

    encoding = "utf-8-sig"

    def test_partial(self):
        text = "\ufeff\x00\xff\u07ff\u0800\uffff"
        expected_steps = [
            "",
            "",
            "", # First BOM has been read and skipped
            "",
            "",
            "\ufeff", # Second BOM has been read and emitted
            "\ufeff\x00", # "\x00" read and emitted
            "\ufeff\x00", # First byte of encoded "\xff" read
            "\ufeff\x00\xff", # Second byte of encoded "\xff" read
            "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800",
            "\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected_steps)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        decoded = str(b"\xef\xbb\xbf", "utf-8-sig")
        self.assertEqual(decoded, "")

    def test_bom(self):
        # Text encoded with the signature must decode back to itself.
        text = "spam"
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read sizes (None means an unsized read()) and check that the
        # collected text equals unistring every time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    # Nothing more to read from the stream.
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM.
        self._check_stream(
            codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # Same payload without a BOM.
        self._check_stream(
            b"ABC\xC2\xA1\xE2\x88\x80XYZ",
            "ABC\u00A1\u2200XYZ",
        )
867
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""

    def test_empty(self):
        # escape_decode() produces bytes, so the expected value has to be
        # a bytes literal: b"" never compares equal to "", which made the
        # previous expectation ("", 0) impossible to satisfy.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000871
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000880
# Punycode sample strings, taken from RFC 3492.
# Every entry is a pair (unicode text, expected punycode bytes).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a pair, which
# would point to a misplaced comma in the table above.
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000988
class PunycodeTest(unittest.TestCase):
    """Run the "punycode" codec over the punycode_testcases table."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = text.encode("punycode").decode("ascii").lower()
            wanted = expected.decode("ascii").lower()
            self.assertEqual(produced, wanted)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt through an
            # ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001007
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated "unicode_internal" codec."""

    # Filter passed to support.check_warnings() wherever the codec's
    # DeprecationWarning is expected.
    _DEPRECATED = ('unicode_internal codec has been '
                   'deprecated', DeprecationWarning)

    @staticmethod
    def _native_order(big_endian):
        # The byte literals in the tests are spelled big-endian; reverse
        # them when running on a little-endian machine.
        if sys.byteorder == "little":
            return bytes(reversed(big_endian))
        return big_endian

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        rejected = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, text in valid:
            raw = self._native_order(raw)
            with support.check_warnings():
                self.assertEqual(text, raw.decode("unicode_internal"))
        for raw in rejected:
            raw = self._native_order(raw)
            with support.check_warnings(self._DEPRECATED):
                self.assertRaises(UnicodeDecodeError, raw.decode,
                                  "unicode_internal")
        # 0x110000 is just past the last valid code point.
        invalid = self._native_order(b"\x00\x11\x00\x00")
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        raw = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        try:
            with support.check_warnings(self._DEPRECATED):
                raw.decode("unicode_internal")
        except UnicodeDecodeError as ex:
            # The error has to describe the second 4-byte unit.
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(raw, ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(self._DEPRECATED):
            ab = "ab".encode("unicode_internal").decode()
            # Put four bytes of junk between the two encoded characters;
            # the registered handler is expected to drop them.
            damaged = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                            "ascii")
            ignored = decoder(damaged, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(self._DEPRECATED):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            for text, consumed in (("a", 1), ("\xe9\u0142", 2)):
                self.assertEqual(encoder(text)[1], consumed)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001083
Martin v. Löwis2548c732003-04-18 10:39:54 +00001084# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1085nameprep_tests = [
1086 # 3.1 Map to nothing.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001087 (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1088 b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1089 b'\xb8\x8f\xef\xbb\xbf',
1090 b'foobarbaz'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001091 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001092 (b'CAFE',
1093 b'cafe'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001094 # 3.3 Case folding 8bit U+00DF (german sharp s).
1095 # The original test case is bogus; it says \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001096 (b'\xc3\x9f',
1097 b'ss'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001098 # 3.4 Case folding U+0130 (turkish capital I with dot).
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001099 (b'\xc4\xb0',
1100 b'i\xcc\x87'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001101 # 3.5 Case folding multibyte U+0143 U+037A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001102 (b'\xc5\x83\xcd\xba',
1103 b'\xc5\x84 \xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001104 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1105 # XXX: skip this as it fails in UCS-2 mode
1106 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1107 # 'telc\xe2\x88\x95kg\xcf\x83'),
1108 (None, None),
1109 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001110 (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
1111 b'\xc7\xb0 a'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001112 # 3.8 Case folding U+1FB7 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001113 (b'\xe1\xbe\xb7',
1114 b'\xe1\xbe\xb6\xce\xb9'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001115 # 3.9 Self-reverting case folding U+01F0 and normalization.
1116 # The original test case is bogus, it says `\xc7\xf0'
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001117 (b'\xc7\xb0',
1118 b'\xc7\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001119 # 3.10 Self-reverting case folding U+0390 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001120 (b'\xce\x90',
1121 b'\xce\x90'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001122 # 3.11 Self-reverting case folding U+03B0 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001123 (b'\xce\xb0',
1124 b'\xce\xb0'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001125 # 3.12 Self-reverting case folding U+1E96 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001126 (b'\xe1\xba\x96',
1127 b'\xe1\xba\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001128 # 3.13 Self-reverting case folding U+1F56 and normalization.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001129 (b'\xe1\xbd\x96',
1130 b'\xe1\xbd\x96'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001131 # 3.14 ASCII space character U+0020.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001132 (b' ',
1133 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001134 # 3.15 Non-ASCII 8bit space character U+00A0.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001135 (b'\xc2\xa0',
1136 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001137 # 3.16 Non-ASCII multibyte space character U+1680.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001138 (b'\xe1\x9a\x80',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001139 None),
1140 # 3.17 Non-ASCII multibyte space character U+2000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001141 (b'\xe2\x80\x80',
1142 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001143 # 3.18 Zero Width Space U+200b.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001144 (b'\xe2\x80\x8b',
1145 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001146 # 3.19 Non-ASCII multibyte space character U+3000.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001147 (b'\xe3\x80\x80',
1148 b' '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001149 # 3.20 ASCII control characters U+0010 U+007F.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001150 (b'\x10\x7f',
1151 b'\x10\x7f'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001152 # 3.21 Non-ASCII 8bit control character U+0085.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001153 (b'\xc2\x85',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001154 None),
1155 # 3.22 Non-ASCII multibyte control character U+180E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001156 (b'\xe1\xa0\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001157 None),
1158 # 3.23 Zero Width No-Break Space U+FEFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001159 (b'\xef\xbb\xbf',
1160 b''),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001161 # 3.24 Non-ASCII control character U+1D175.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001162 (b'\xf0\x9d\x85\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001163 None),
1164 # 3.25 Plane 0 private use character U+F123.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001165 (b'\xef\x84\xa3',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001166 None),
1167 # 3.26 Plane 15 private use character U+F1234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001168 (b'\xf3\xb1\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001169 None),
1170 # 3.27 Plane 16 private use character U+10F234.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001171 (b'\xf4\x8f\x88\xb4',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001172 None),
1173 # 3.28 Non-character code point U+8FFFE.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001174 (b'\xf2\x8f\xbf\xbe',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001175 None),
1176 # 3.29 Non-character code point U+10FFFF.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001177 (b'\xf4\x8f\xbf\xbf',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001178 None),
1179 # 3.30 Surrogate code U+DF42.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001180 (b'\xed\xbd\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001181 None),
1182 # 3.31 Non-plain text character U+FFFD.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001183 (b'\xef\xbf\xbd',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001184 None),
1185 # 3.32 Ideographic description character U+2FF5.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001186 (b'\xe2\xbf\xb5',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001187 None),
1188 # 3.33 Display property character U+0341.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001189 (b'\xcd\x81',
1190 b'\xcc\x81'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001191 # 3.34 Left-to-right mark U+200E.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001192 (b'\xe2\x80\x8e',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001193 None),
1194 # 3.35 Deprecated U+202A.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001195 (b'\xe2\x80\xaa',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001196 None),
1197 # 3.36 Language tagging character U+E0001.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001198 (b'\xf3\xa0\x80\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001199 None),
1200 # 3.37 Language tagging character U+E0042.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001201 (b'\xf3\xa0\x81\x82',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001202 None),
1203 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001204 (b'foo\xd6\xbebar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001205 None),
1206 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001207 (b'foo\xef\xb5\x90bar',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001208 None),
1209 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001210 (b'foo\xef\xb9\xb6bar',
1211 b'foo \xd9\x8ebar'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001212 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001213 (b'\xd8\xa71',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001214 None),
1215 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001216 (b'\xd8\xa71\xd8\xa8',
1217 b'\xd8\xa71\xd8\xa8'),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001218 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001219 # Skip this test as we allow unassigned
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001220 #(b'\xf3\xa0\x80\x82',
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001221 # None),
1222 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001223 # 3.44 Larger test (shrinking).
1224 # Original test case reads \xc3\xdf
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001225 (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1226 b'\xaa\xce\xb0\xe2\x80\x80',
1227 b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001228 # 3.45 Larger test (expanding).
1229 # Original test case reads \xc3\x9f
Walter Dörwald0ac30f82007-05-11 10:32:57 +00001230 (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1231 b'\x80',
1232 b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1233 b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1234 b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
Martin v. Löwis2548c732003-04-18 10:39:54 +00001235 ]
1236
1237
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep() over the nameprep_tests table."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        # Number the vectors from 1 so failures can be reported by their
        # section number ("3.<n>") in the table.
        for number, (orig, prepped) in enumerate(nameprep_tests, 1):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            expected = str(prepped, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), expected)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001256
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use."""

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # Once all three bytes have been read, a further read() must
        # return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed every name one byte at a time.  The same four names as in
        # test_builtin_decode are used, so the ACE form is covered both
        # with and without a trailing dot (the dot-less form was missing
        # before because the trailing-dot assertion was duplicated).
        cases = (
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        )
        for raw, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in raw), "idna")),
                expected
            )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is only emitted once the dot that ends it has been
        # seen; the last label is held back until final=True.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        decoder.reset()
        # With a trailing dot nothing is left over for the final call.
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four names as in test_builtin_encode; iterencode() feeds
        # the text one character at a time.  The dot-less non-ASCII name
        # replaces a duplicated trailing-dot assertion.
        cases = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        encoder = codecs.getincrementalencoder("idna")()
        # The unfinished last label is only produced by the final call.
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001332
class CodecsModuleTest(unittest.TestCase):
    """Checks for the module-level helper functions of ``codecs``."""

    def test_decode(self):
        """codecs.decode(): explicit codec, missing args, default, bad input."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): explicit codec, missing args, unknown codec, default."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() rejects a missing or non-callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing name and unknown names."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        for unknown in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(unknown)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Register the restore before switching so it runs on every exit path.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        else:
            info = codecs.lookup('ASCII')
            self.assertEqual(info.name, 'ascii')
1387
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 stream reader."""

    def setUp(self):
        # Two UTF-8 encoded characters separated by a newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """The stream is split into two decoded lines; only the first ends in a newline."""
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001397
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the backing file holds UTF-8, the wrapper hands out UTF-16-LE.
        utf8_source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        reading = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(reading.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 goes into the wrapper, Latin-1 lands in the backing file.
        latin1_sink = io.BytesIO()
        writing = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin-1')
        writing.write(b'\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001409
# Text encodings exercised by the generic tests below, in sorted order.
all_unicode_encodings = [
    "ascii", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866",
    "cp869", "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16", "iso8859_2", "iso8859_3", "iso8859_4",
    "iso8859_5", "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620", "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings += ["mbcs"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Incremental coders are skipped for the encodings above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001527
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic round-trip and argument checks run over all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII string through every codec interface.

        For each encoding this checks the canonical codec name, the
        stateless encoder/decoder, the stream writer/reader, the
        incremental coders (Python and C API) and
        iterencode()/iterdecode().  Encodings listed in
        broken_unicode_with_streams / broken_incremental_coders skip the
        corresponding sections.
        """
        s = "abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The looked-up name must match the requested one, modulo
            # "_" vs "-" and the two special cases handled below.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                # Write one character at a time and collect whatever
                # bytes the writer pushed into the queue.
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                # Feed the encoded data back one byte at a time.
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes anything the encoder buffered
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c])) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """Re-read an encoded stream five times, seeking to the start each time."""
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        """Every decoder raises TypeError when called without arguments.

        Passing an int must also raise TypeError, except for "idna" and
        "punycode", which are excluded from that second check.
        """
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Every encoder raises TypeError when called without arguments."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        # Comparing the type with itself is deliberate: only the access
        # above matters, the assertion cannot fail.
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # Both helpers are inherited from MixInCheckStateHandling.
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
1665
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode() driven by a string used as the mapping."""

    def test_decode_with_string_map(self):
        """Decode three bytes against maps of varying completeness."""
        three_bytes = b"\x00\x01\x02"
        # (error handler, mapping string, expected decoded text)
        scenarios = (
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, mapping, expected_text in scenarios:
            self.assertEqual(
                codecs.charmap_decode(three_bytes, errors, mapping),
                (expected_text, 3)
            )

        # With an empty map and "ignore", every byte value is dropped but
        # still counted as consumed.
        allbytes = bytes(range(256))
        outcome = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(outcome, ("", len(allbytes)))
1698
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers used as context managers."""

    def test_encodedfile(self):
        """EncodedFile works in a with statement and transcodes on read."""
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as wrapped:
            self.assertEqual(wrapped.read(), b"\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter works in a with statement and decodes on read."""
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(
            raw, utf8.streamreader, utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001711
class TypesTest(unittest.TestCase):
    """Which argument types the low-level decode functions accept."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        strict_names = (
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        )
        decoders = [getattr(codecs, attr) for attr in strict_names]
        # mbcs_decode is only checked when this build provides it.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decoder in decoders:
            with self.assertRaises(TypeError):
                decoder("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        valid = ("\u1234", 6)
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), valid)
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), valid)
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), valid)
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), valid)

        # \U00110000 is rejected in strict mode and becomes U+FFFD with
        # "replace", for both escape codecs.
        replaced = ("\ufffd", 10)
        with self.assertRaises(UnicodeDecodeError):
            codecs.unicode_escape_decode(br"\U00110000")
        self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), replaced)

        with self.assertRaises(UnicodeDecodeError):
            codecs.raw_unicode_escape_decode(br"\U00110000")
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), replaced)
1747
class SurrogateEscapeTest(unittest.TestCase):
    """Round trips through the "surrogateescape" error handler."""

    def test_utf8(self):
        pairs = (
            # Bad byte
            (b"foo\x80bar", "foo\udc80bar"),
            # bad-utf-8 encoded surrogate
            (b"\xed\xb0\x80", "\udced\udcb0\udc80"),
        )
        for raw, escaped in pairs:
            self.assertEqual(raw.decode("utf-8", "surrogateescape"), escaped)
            self.assertEqual(escaped.encode("utf-8", "surrogateescape"), raw)

    def test_ascii(self):
        # bad byte
        raw, escaped = b"foo\x80bar", "foo\udc80bar"
        self.assertEqual(raw.decode("ascii", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("ascii", "surrogateescape"), raw)

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        raw, escaped = b"foo\xa5bar", "foo\udca5bar"
        self.assertEqual(raw.decode("iso-8859-3", "surrogateescape"), escaped)
        self.assertEqual(escaped.encode("iso-8859-3", "surrogateescape"), raw)

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        encoded = escaped.encode("latin-1", "surrogateescape")
        self.assertEqual(encoded, b"\xe4\xeb\xef\xf6\xfc")
1780
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001781
class BomTest(unittest.TestCase):
    """BOM handling of files opened with codecs.open() across seeks."""

    def test_seek0(self):
        """Write, seek and re-read a file for each UTF-16/UTF-32 variant."""
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        # One scratch file is reused for every scenario and removed at the end.
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as fh:
                fh.write(data)
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), doubled)
                fh.seek(0)
                self.assertEqual(fh.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as fh:
                fh.write(data[0])
                self.assertNotEqual(fh.tell(), 0)
                fh.seek(0)
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as fh:
                writer = fh.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as fh:
                fh.write(data)
                fh.seek(fh.tell())
                fh.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as fh:
                writer = fh.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                fh.seek(0)
                self.assertEqual(fh.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001837
Victor Stinner3fed0872010-05-22 02:16:27 +00001838
# Bytes-to-bytes codecs that are always tested.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

# The compression codecs are only tested when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1857
class TransformCodecTest(unittest.TestCase):
    """Round trips through the codecs in bytes_transform_encodings."""

    def test_basics(self):
        """Encode then decode all 256 byte values through each codec."""
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encode = codecs.getencoder(encoding)
            encoded, consumed = encode(binput)
            self.assertEqual(consumed, len(binput))
            decode = codecs.getdecoder(encoding)
            restored, consumed = decode(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(restored, binput)

    def test_read(self):
        """A stream reader's read() returns the original byte."""
        for encoding in bytes_transform_encodings:
            payload = codecs.encode(b"\x80", encoding)
            stream = codecs.getreader(encoding)(io.BytesIO(payload))
            self.assertEqual(stream.read(), b"\x80")

    def test_readline(self):
        """A stream reader's readline() returns the original byte."""
        for encoding in bytes_transform_encodings:
            # uu_codec and zlib_codec are excluded from the readline() check.
            if encoding in ['uu_codec', 'zlib_codec']:
                continue
            payload = codecs.encode(b"\x80", encoding)
            stream = codecs.getreader(encoding)(io.BytesIO(payload))
            self.assertEqual(stream.readline(), b"\x80")
1885
1886
Victor Stinner62be4fb2011-10-18 21:46:37 +02001887@unittest.skipUnless(sys.platform == 'win32',
1888 'code pages are specific to Windows')
Victor Stinner3a50e702011-10-18 21:21:00 +02001889class CodePageTest(unittest.TestCase):
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001890 # CP_UTF8 is already tested by CP65001Test
Victor Stinner3a50e702011-10-18 21:21:00 +02001891 CP_UTF8 = 65001
Victor Stinner62be4fb2011-10-18 21:46:37 +02001892
Victor Stinner3a50e702011-10-18 21:21:00 +02001893 def test_invalid_code_page(self):
1894 self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
1895 self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
1896 self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
1897 self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
1898
1899 def test_code_page_name(self):
1900 self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
1901 codecs.code_page_encode, 932, '\xff')
1902 self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
1903 codecs.code_page_decode, 932, b'\x81\x00')
1904 self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
1905 codecs.code_page_decode, self.CP_UTF8, b'\xff')
1906
def check_decode(self, cp, tests):
    """Run (raw, errors, expected) decode cases against code page *cp*.

    An *expected* of None means the decode must raise UnicodeDecodeError;
    otherwise the decoded text must equal *expected* and the reported
    length must lie between 0 and len(raw).
    """
    for raw, errors, expected in tests:
        if expected is None:
            self.assertRaises(UnicodeDecodeError,
                codecs.code_page_decode, cp, raw, errors)
            continue
        try:
            decoded = codecs.code_page_decode(cp, raw, errors)
        except UnicodeDecodeError as err:
            self.fail('Unable to decode %a from "cp%s" with '
                      'errors=%r: %s' % (raw, cp, errors, err))
        text = decoded[0]
        consumed = decoded[1]
        self.assertEqual(text, expected,
            '%a.decode("cp%s", %r)=%a != %a'
            % (raw, cp, errors, text, expected))
        # assert 0 <= decoded[1] <= len(raw)
        self.assertGreaterEqual(consumed, 0)
        self.assertLessEqual(consumed, len(raw))
1924
1925 def check_encode(self, cp, tests):
1926 for text, errors, expected in tests:
1927 if expected is not None:
1928 try:
1929 encoded = codecs.code_page_encode(cp, text, errors)
1930 except UnicodeEncodeError as err:
1931 self.fail('Unable to encode %a to "cp%s" with '
1932 'errors=%r: %s' % (text, cp, errors, err))
1933 self.assertEqual(encoded[0], expected,
1934 '%a.encode("cp%s", %r)=%a != %a'
1935 % (text, cp, errors, encoded[0], expected))
1936 self.assertEqual(encoded[1], len(text))
1937 else:
1938 self.assertRaises(UnicodeEncodeError,
1939 codecs.code_page_encode, cp, text, errors)
1940
1941 def test_cp932(self):
1942 self.check_encode(932, (
1943 ('abc', 'strict', b'abc'),
1944 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001945 # test error handlers
Victor Stinner3a50e702011-10-18 21:21:00 +02001946 ('\xff', 'strict', None),
1947 ('[\xff]', 'ignore', b'[]'),
1948 ('[\xff]', 'replace', b'[y]'),
1949 ('[\u20ac]', 'replace', b'[?]'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001950 ('[\xff]', 'backslashreplace', b'[\\xff]'),
1951 ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02001952 ))
Victor Stinner9e921882011-10-18 21:55:25 +02001953 self.check_decode(932, (
Victor Stinner3a50e702011-10-18 21:21:00 +02001954 (b'abc', 'strict', 'abc'),
1955 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
1956 # invalid bytes
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001957 (b'[\xff]', 'strict', None),
1958 (b'[\xff]', 'ignore', '[]'),
1959 (b'[\xff]', 'replace', '[\ufffd]'),
1960 (b'[\xff]', 'surrogateescape', '[\udcff]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02001961 (b'\x81\x00abc', 'strict', None),
1962 (b'\x81\x00abc', 'ignore', '\x00abc'),
Victor Stinner9e921882011-10-18 21:55:25 +02001963 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
1964 ))
Victor Stinner3a50e702011-10-18 21:21:00 +02001965
1966 def test_cp1252(self):
1967 self.check_encode(1252, (
1968 ('abc', 'strict', b'abc'),
1969 ('\xe9\u20ac', 'strict', b'\xe9\x80'),
1970 ('\xff', 'strict', b'\xff'),
1971 ('\u0141', 'strict', None),
1972 ('\u0141', 'ignore', b''),
1973 ('\u0141', 'replace', b'L'),
1974 ))
1975 self.check_decode(1252, (
1976 (b'abc', 'strict', 'abc'),
1977 (b'\xe9\x80', 'strict', '\xe9\u20ac'),
1978 (b'\xff', 'strict', '\xff'),
1979 ))
1980
1981 def test_cp_utf7(self):
1982 cp = 65000
1983 self.check_encode(cp, (
1984 ('abc', 'strict', b'abc'),
1985 ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
1986 ('\U0010ffff', 'strict', b'+2//f/w-'),
1987 ('\udc80', 'strict', b'+3IA-'),
1988 ('\ufffd', 'strict', b'+//0-'),
1989 ))
1990 self.check_decode(cp, (
1991 (b'abc', 'strict', 'abc'),
1992 (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
1993 (b'+2//f/w-', 'strict', '\U0010ffff'),
1994 (b'+3IA-', 'strict', '\udc80'),
1995 (b'+//0-', 'strict', '\ufffd'),
1996 # invalid bytes
1997 (b'[+/]', 'strict', '[]'),
1998 (b'[\xff]', 'strict', '[\xff]'),
1999 ))
2000
Victor Stinner3a50e702011-10-18 21:21:00 +02002001 def test_multibyte_encoding(self):
2002 self.check_decode(932, (
2003 (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
2004 (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
2005 ))
2006 self.check_decode(self.CP_UTF8, (
2007 (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
2008 (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
2009 ))
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02002010 if VISTA_OR_LATER:
Victor Stinner3a50e702011-10-18 21:21:00 +02002011 self.check_encode(self.CP_UTF8, (
2012 ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
2013 ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
2014 ))
2015
2016 def test_incremental(self):
Victor Stinner76a31a62011-11-04 00:05:13 +01002017 decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
2018 self.assertEqual(decoded, ('', 0))
2019
Victor Stinner3a50e702011-10-18 21:21:00 +02002020 decoded = codecs.code_page_decode(932,
2021 b'\xe9\x80\xe9', 'strict',
2022 False)
2023 self.assertEqual(decoded, ('\u9a3e', 2))
2024
2025 decoded = codecs.code_page_decode(932,
2026 b'\xe9\x80\xe9\x80', 'strict',
2027 False)
2028 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
2029
2030 decoded = codecs.code_page_decode(932,
2031 b'abc', 'strict',
2032 False)
2033 self.assertEqual(decoded, ('abc', 3))
2034
2035
def test_main():
    """Run the test case classes listed below via support.run_unittest().

    The classes are handed over in exactly the order in which they are
    listed.
    """
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002067
2068
# Run the suite when this file is executed as a script rather than imported.
if __name__ == '__main__':
    test_main()