blob: ffd2d790e83f65d28f18e2360af9440e8991d742 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import unittest
Victor Stinner05010702011-05-27 16:50:40 +02003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner05010702011-05-27 16:50:40 +02005import sys, _testcapi, io
Victor Stinner182d90d2011-09-29 19:53:55 +02006
# True only on Windows with a major version of 6 (Vista) or newer; always
# False on every other platform.  The short-circuiting ``and`` ensures
# sys.getwindowsversion() is only called when running on win32.
VISTA_OR_LATER = (sys.platform == 'win32'
                  and sys.getwindowsversion().major >= 6)
11
# ctypes is optional.  When it cannot be imported the name is bound to None
# and the size of wchar_t is reported as -1 ("unknown").
try:
    import ctypes
except ImportError:
    ctypes = None

SIZEOF_WCHAR_T = -1 if ctypes is None else ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +000019
class Queue(object):
    """
    Minimal FIFO stream: data written at one end is read back, in order,
    from the other end.  Works with any sliceable buffer type that
    supports ``+=`` (the tests use bytes).
    """

    def __init__(self, buffer):
        # The initial buffer also fixes the type of everything read later.
        self._buffer = buffer

    def write(self, chars):
        # Append to the tail of the queue.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole queue; otherwise at most *size*
        # items are removed from the head and returned.
        pending = self._buffer
        if size < 0:
            self._buffer = pending[:0]  # empty buffer of the same type
            return pending
        self._buffer = pending[size:]
        return pending[:size]
39
class MixInCheckStateHandling:
    """Helpers that verify incremental codecs survive a state hand-over.

    Intended to be mixed into a class that provides the unittest assertion
    methods (assertEqual, assertTrue, assertIsInstance).
    """

    def check_state_handling_decode(self, encoding, u, s):
        # For every split point of the encoded input *s*: decode the head,
        # capture the decoder state, install it in a brand-new decoder and
        # let that one finish the tail.  Head plus tail must equal *u*.
        new_decoder = codecs.getincrementaldecoder(encoding)
        for cut in range(len(s) + 1):
            decoder = new_decoder()
            head = decoder.decode(s[:cut])
            saved = decoder.getstate()
            self.assertIsInstance(saved[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not saved[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((saved[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not decoder.decode(saved[0]))
                # The decoder must return to the same state
                self.assertEqual(saved, decoder.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            successor = new_decoder()
            successor.setstate(saved)
            tail = successor.decode(s[cut:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        # Same idea for encoders: encode the first *cut* characters of *u*,
        # move the state to a fresh encoder and finish there.  The two
        # encoded pieces together must equal *s*.
        new_encoder = codecs.getincrementalencoder(encoding)
        for cut in range(len(u) + 1):
            encoder = new_encoder()
            head = encoder.encode(u[:cut])
            saved = encoder.getstate()
            successor = new_encoder()
            successor.setstate(saved)
            tail = successor.encode(u[cut:], True)
            self.assertEqual(s, head + tail)
72
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    """Reader tests shared by the codec test classes.

    Subclasses must define the class attribute ``encoding``; every test
    here encodes its sample text with that codec and reads it back through
    the codec's StreamReader and/or incremental decoder.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            # StreamReader over the encoded form of *input*
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read *input* line by line until readline() returns an empty
            # string and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be spelled out explicitly: every one of them is
        # whitespace, so building the list with str.split() yields an empty
        # list and the loops below would silently never run.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps the first line non-empty; an empty line read with
            # keepends=False would look like end-of-input to readalllines().
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the "xxx" line that follows each test line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back exactly the lines
        # that were written, in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data arrives at the reader
        # in the same pieces in which it is written.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        # Three consecutive lines must be read back unchanged, followed by
        # an empty string at end of input.
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000292
class UTF32Test(ReadTest):
    """Tests for the BOM-prefixed "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        codec = codecs.lookup(self.encoding)
        # encode some stream with two separate writes
        raw = io.BytesIO()
        out = codec.streamwriter(raw)
        for _ in range(2):
            out.write("spam")
        data = raw.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(data in (self.spamle, self.spambe))
        # try to read it back
        back = codec.streamreader(io.BytesIO(data))
        self.assertEqual(back.read(), "spamspam")

    def test_badbom(self):
        # 0xff bytes only: neither byte order's BOM, must be rejected
        for length in (4, 8):
            f = codecs.getreader(self.encoding)(io.BytesIO(length * b"\xff"))
            self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        self.check_partial(
            text,
            # Nothing can be decoded while the 4 BOM bytes and the first 3
            # bytes of the first character arrive; from then on every 4th
            # byte completes one more character.
            [""] * 7
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )

    def test_handlers(self):
        # A truncated code unit, handled by the non-strict error handlers
        for errors, expected in (('replace', '\ufffd'), ('ignore', '')):
            self.assertEqual((expected, 1),
                             codecs.utf_32_decode(b'\x01', errors, True))

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_decode(b"\xff", "strict", True)

    def test_decoder_state(self):
        for encoded in (self.spamle, self.spambe):
            self.check_state_handling_decode(self.encoding,
                                             "spamspam", encoded)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        expected = '\U00010000' * 1024
        variants = (
            (b'\xff\xfe\x00\x00', b'\x00\x00\x01\x00'),  # little-endian
            (b'\x00\x00\xfe\xff', b'\x00\x01\x00\x00'),  # big-endian
        )
        for bom, unit in variants:
            self.assertEqual(expected,
                             codecs.utf_32_decode(bom + unit * 1024)[0])
379
class UTF32LETest(ReadTest):
    """Tests for the BOM-less little-endian "utf-32-le" codec."""
    encoding = "utf-32-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        self.check_partial(
            text,
            # No BOM: the first character is complete after 4 bytes, and
            # every further 4 bytes add one more.
            [""] * 3
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x03\x02\x01\x00")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_le_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        decoded = codecs.utf_32_le_decode(b'\x00\x00\x01\x00' * repeat)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
419
class UTF32BETest(ReadTest):
    """Tests for the BOM-less big-endian "utf-32-be" codec."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        self.check_partial(
            text,
            # No BOM: the first character is complete after 4 bytes, and
            # every further 4 bytes add one more.
            [""] * 3
            + [text[:1]] * 4
            + [text[:2]] * 4
            + [text[:3]] * 4
            + [text]
        )

    def test_simple(self):
        encoded = "\U00010203".encode(self.encoding)
        self.assertEqual(encoded, b"\x00\x01\x02\x03")

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_32_be_decode(b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        repeat = 1024
        decoded = codecs.utf_32_be_decode(b'\x00\x01\x00\x00' * repeat)[0]
        self.assertEqual('\U00010000' * repeat, decoded)
459
460
class UTF16Test(ReadTest):
    """Tests for the BOM-prefixed "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" preceded by exactly one BOM, little- and big-endian
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        # (assertIn reports the offending bytes when the check fails)
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        # 0xff bytes only: neither byte order's BOM, must be rejected
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",  # first byte of BOM read
                "",  # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # A truncated code unit, handled by the non-strict error handlers
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # If an encoding is given, files are always opened in binary mode,
        # even if no binary mode was specified. This means that no automatic
        # conversion of '\n' is done on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # Plain 'r' is used instead of the 'U' (universal newlines) flag:
        # 'U' is no longer accepted by newer Python versions, while 'r'
        # takes the same forced-binary path in codecs.open().
        with codecs.open(support.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000536
class UTF16LETest(ReadTest):
    """Tests for the BOM-less little-endian "utf-16-le" codec."""
    encoding = "utf-16-le"

    def test_partial(self):
        text = "\x00\xff\u0100\uffff"
        self.check_partial(
            text,
            # No BOM: every second byte completes one more character.
            [""]
            + [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 2
            + [text]
        )

    def test_errors(self):
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_le_decode(b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair
        text, raw = "\U00010203", b'\x00\xd8\x03\xde'
        self.assertEqual(text.encode(self.encoding), raw)
        self.assertEqual(raw.decode(self.encoding), text)
563 "\U00010203")
564
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000565class UTF16BETest(ReadTest):
566 encoding = "utf-16-be"
Walter Dörwald69652032004-09-07 20:24:22 +0000567
568 def test_partial(self):
569 self.check_partial(
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000570 "\x00\xff\u0100\uffff",
Walter Dörwald69652032004-09-07 20:24:22 +0000571 [
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000572 "",
573 "\x00",
574 "\x00",
575 "\x00\xff",
576 "\x00\xff",
577 "\x00\xff\u0100",
578 "\x00\xff\u0100",
579 "\x00\xff\u0100\uffff",
Walter Dörwald69652032004-09-07 20:24:22 +0000580 ]
581 )
582
Walter Dörwalde22d3392005-11-17 08:52:34 +0000583 def test_errors(self):
Walter Dörwald3abcb012007-04-16 22:10:50 +0000584 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000585 b"\xff", "strict", True)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000586
Victor Stinner53a9dd72010-12-08 22:25:45 +0000587 def test_nonbmp(self):
588 self.assertEqual("\U00010203".encode(self.encoding),
589 b'\xd8\x00\xde\x03')
590 self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
591 "\U00010203")
592
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        text = "\x00\xff\u07ff\u0800\uffff"
        self.check_partial(
            text,
            # The characters take 1, 2, 2, 3 and 3 bytes; a character only
            # shows up once its last byte has been fed.
            [text[:1]] * 2
            + [text[:2]] * 2
            + [text[:3]] * 3
            + [text[:4]] * 3
            + [text]
        )

    def test_decoder_state(self):
        sample = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         sample, sample.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # The other error handlers each substitute something different
        by_handler = (
            ("backslashreplace", b'[\\udc80]'),
            ("xmlcharrefreplace", b'[&#56448;]'),
            ("surrogateescape", b'[\x80]'),
            ("ignore", b'[]'),
            ("replace", b'[?]'),
        )
        for handler, expected in by_handler:
            self.assertEqual("[\uDC80]".encode("utf-8", handler), expected)

    def test_surrogatepass_handler(self):
        # "surrogatepass" lets lone surrogates through in both directions
        pairs = (
            ("abc\ud800def", b"abc\xed\xa0\x80def"),
            ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
        )
        for text, raw in pairs:
            self.assertEqual(text.encode("utf-8", "surrogatepass"), raw)
            self.assertEqual(raw.decode("utf-8", "surrogatepass"), text)
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000643
Victor Stinner2f3ca9f2011-10-27 01:38:56 +0200644@unittest.skipUnless(sys.platform == 'win32',
645 'cp65001 is a Windows-only codec')
646class CP65001Test(ReadTest):
647 encoding = "cp65001"
648
649 def test_encode(self):
650 tests = [
651 ('abc', 'strict', b'abc'),
652 ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
653 ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
654 ]
655 if VISTA_OR_LATER:
656 tests.extend((
657 ('\udc80', 'strict', None),
658 ('\udc80', 'ignore', b''),
659 ('\udc80', 'replace', b'?'),
660 ('\udc80', 'backslashreplace', b'\\udc80'),
661 ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
662 ))
663 else:
664 tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
665 for text, errors, expected in tests:
666 if expected is not None:
667 try:
668 encoded = text.encode('cp65001', errors)
669 except UnicodeEncodeError as err:
670 self.fail('Unable to encode %a to cp65001 with '
671 'errors=%r: %s' % (text, errors, err))
672 self.assertEqual(encoded, expected,
673 '%a.encode("cp65001", %r)=%a != %a'
674 % (text, errors, encoded, expected))
675 else:
676 self.assertRaises(UnicodeEncodeError,
677 text.encode, "cp65001", errors)
678
679 def test_decode(self):
680 tests = [
681 (b'abc', 'strict', 'abc'),
682 (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
683 (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
684 (b'\xef\xbf\xbd', 'strict', '\ufffd'),
685 (b'[\xc3\xa9]', 'strict', '[\xe9]'),
686 # invalid bytes
687 (b'[\xff]', 'strict', None),
688 (b'[\xff]', 'ignore', '[]'),
689 (b'[\xff]', 'replace', '[\ufffd]'),
690 (b'[\xff]', 'surrogateescape', '[\udcff]'),
691 ]
692 if VISTA_OR_LATER:
693 tests.extend((
694 (b'[\xed\xb2\x80]', 'strict', None),
695 (b'[\xed\xb2\x80]', 'ignore', '[]'),
696 (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
697 ))
698 else:
699 tests.extend((
700 (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
701 ))
702 for raw, errors, expected in tests:
703 if expected is not None:
704 try:
705 decoded = raw.decode('cp65001', errors)
706 except UnicodeDecodeError as err:
707 self.fail('Unable to decode %a from cp65001 with '
708 'errors=%r: %s' % (raw, errors, err))
709 self.assertEqual(decoded, expected,
710 '%a.decode("cp65001", %r)=%a != %a'
711 % (raw, errors, decoded, expected))
712 else:
713 self.assertRaises(UnicodeDecodeError,
714 raw.decode, 'cp65001', errors)
715
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_lone_surrogates(self):
    """Check how cp65001 treats lone surrogates, with and without handlers."""
    # Without an explicit handler both directions must fail.
    self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
    self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")

    # (error handler, bytes expected when encoding "[\uDC80]")
    by_handler = (
        ("backslashreplace", b'[\\udc80]'),
        ("xmlcharrefreplace", b'[&#56448;]'),
        ("surrogateescape", b'[\x80]'),
        ("ignore", b'[]'),
        ("replace", b'[?]'),
    )
    for handler, wanted in by_handler:
        self.assertEqual("[\uDC80]".encode("cp65001", handler), wanted)
730
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_surrogatepass_handler(self):
    """Round-trip lone surrogates through cp65001 using 'surrogatepass'."""
    # (text, encoded bytes); each pair is checked in both directions
    pairs = (
        ("abc\ud800def", b"abc\xed\xa0\x80def"),
        ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
    )
    for text, raw in pairs:
        self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
        self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)
    # The handler must also be retrievable by name.
    self.assertTrue(codecs.lookup_error("surrogatepass"))
742
743
744
class UTF7Test(ReadTest):
    """ReadTest subclass configured for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # check_partial comes from the base class (not shown here);
        # presumably the list holds the text expected after each
        # successive piece of encoded input -- confirm against ReadTest.
        text = "a+-b"
        expected_steps = ["a", "a", "a+", "a+-", "a+-b"]
        self.check_partial(text, expected_steps)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000759
class UTF16ExTest(unittest.TestCase):
    """Error and argument checks for codecs.utf_16_ex_decode."""

    def test_errors(self):
        # A lone byte decoded under "strict" (extra arguments 0, True)
        # has to raise.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
767
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode."""

    def test_array(self):
        # An array of signed bytes is returned as bytes plus its length.
        import array
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        # An empty str gives empty bytes and a length of 0.
        outcome = codecs.readbuffer_encode("")
        self.assertEqual(outcome, (b"", 0))

    def test_bad_args(self):
        # Neither a missing argument nor an int is accepted.
        for args in ((), (42,)):
            with self.assertRaises(TypeError):
                codecs.readbuffer_encode(*args)
783
class UTF8SigTest(ReadTest):
    """ReadTest subclass for the "utf-8-sig" codec, plus BOM-specific checks."""
    encoding = "utf-8-sig"

    # Read sizes tried by the stream tests; None means "read everything".
    _SIZEHINTS = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # Encoding with utf-8-sig and decoding incrementally must give
        # back the original text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        """Read *bytestring* through a utf-8-sig reader and compare the text.

        The whole input is consumed once per entry in ``_SIZEHINTS``; the
        concatenated chunks must equal *unistring* every time.
        """
        reader = codecs.getreader("utf-8-sig")
        for sizehint in self._SIZEHINTS:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()
                if not data:
                    break
                ostream.write(data)

            # Name the read size so a failure points at the offending case.
            self.assertEqual(ostream.getvalue(), unistring,
                             "sizehint=%r" % (sizehint,))

    def test_stream_bom(self):
        # Input starts with a BOM, which must not show up in the text.
        self._check_stream(codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ",
                           "ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Same payload without a BOM must decode to the same text.
        self._check_stream(b"ABC\xC2\xA1\xE2\x88\x80XYZ",
                           "ABC\u00A1\u2200XYZ")
863
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode."""

    def test_empty(self):
        # escape_decode produces bytes, so the expected value has to be
        # b"": a bytes object never compares equal to the str "".
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        # A str argument is accepted as well and still yields bytes.
        self.assertEqual(codecs.escape_decode(""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000867
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000868class RecodingTest(unittest.TestCase):
869 def test_recoding(self):
Guido van Rossumf4cfc8f2007-05-17 21:52:23 +0000870 f = io.BytesIO()
Victor Stinner05010702011-05-27 16:50:40 +0200871 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000872 f2.write("a")
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000873 f2.close()
874 # Python used to crash on this at exit because of a refcount
875 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000876
# From RFC 3492
# Each entry is a (text, punycode) pair: a str and the bytes PunycodeTest
# compares against its "punycode" encoding (case-insensitively).
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check run at import time: print any entry whose length is not 2.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000984
class PunycodeTest(unittest.TestCase):
    """Run the punycode_testcases vectors through the "punycode" codec."""

    def test_encode(self):
        for text, wanted in punycode_testcases:
            # Both sides are lower-cased before comparing: some of the
            # extended encodings use upper case while our code produces
            # only lower case, and lowering just the expected value is
            # not enough because some input characters are upper case.
            produced = str(text.encode("punycode"), "ascii").lower()
            reference = str(wanted, "ascii").lower()
            self.assertEqual(produced, reference)

    def test_decode(self):
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Repeat with a bytes object rebuilt through an ASCII round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001003
class UnicodeInternalTest(unittest.TestCase):
    """Checks for the "unicode_internal" codec."""

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        invalid = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        # The literals above are written big-endian; flip them on
        # little-endian machines.
        flip = sys.byteorder == "little"
        for raw, text in valid:
            if flip:
                raw = bytes(reversed(raw))
            self.assertEqual(text, raw.decode("unicode_internal"))
        for raw in invalid:
            if flip:
                raw = bytes(reversed(raw))
            self.assertRaises(UnicodeDecodeError, raw.decode,
                              "unicode_internal")

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised error must describe the codec, the input and the
        # position of the bad unit.
        data = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # Register ignore_errors under a private name and decode "ab"
        # with four extra bytes spliced into the middle.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        payload = bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), "ascii")
        outcome = decoder(payload, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), outcome)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        for text, length in (("a", 1), ("\xe9\u0142", 2)):
            self.assertEqual(encoder(text)[1], length)

        self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +00001060
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 encoded bytes.
# An expected value of None means nameprep must reject the input;
# (None, None) is a placeholder for a skipped vector, which keeps the
# position-based "3.N" numbering used by NameprepTest intact.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1213
1214
class NameprepTest(unittest.TestCase):
    """Run the nameprep_tests vectors through encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, wanted) in enumerate(nameprep_tests):
            if raw is None:
                # Placeholder for a skipped vector
                continue
            # The vectors are stored as UTF-8 bytes
            text = str(raw, "utf-8", "surrogatepass")
            if wanted is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = str(wanted, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                # Re-raise with the 1-based vector number in the message
                raise support.TestFailed("Test 3.%d: %s" % (index+1, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001233
class IDNACodecTest(unittest.TestCase):
    """Exercise the "idna" codec via the builtin, stream and incremental APIs.

    Every group covers ASCII and non-ASCII names, each with and without
    a trailing dot.
    """

    def test_builtin_decode(self):
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        # After the three available bytes were requested, a further
        # read() must return the empty string.
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        # Feed the input one byte at a time.  The third case used to be
        # a duplicate of the fourth, which left the non-ASCII name
        # without a trailing dot untested.
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for raw, expected in cases:
            self.assertEqual(
                "".join(codecs.iterdecode((bytes([c]) for c in raw), "idna")),
                expected
            )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # With a trailing dot the last label is emitted right away.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # Same four shapes as the decode test; the third case replaces a
        # duplicated trailing-dot assertion.
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(
                b"".join(codecs.iterencode(text, "idna")),
                expected
            )

        # Without a trailing dot the last label is only emitted by the
        # final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # With a trailing dot everything is emitted before the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001309
class CodecsModuleTest(unittest.TestCase):
    """Argument checking and basic behaviour of module-level codecs functions."""

    def test_decode(self):
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # no encoding given: the default is used
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # no encoding given: the default is used
        self.assertEqual(codecs.encode('abc'), b'abc')
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        with self.assertRaises(TypeError):
            codecs.lookup()
        for bogus in ("__spam__", " "):
            with self.assertRaises(LookupError):
                codecs.lookup(bogus)

    def test_getencoder(self):
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Register the restore first so it runs even if the test is skipped.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1364
class StreamReaderTest(unittest.TestCase):
    """readlines() on a UTF-8 StreamReader wrapping an in-memory stream."""

    def setUp(self):
        # Two lines of UTF-8 encoded text; the second has no trailing newline.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001374
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the UTF-8 file content comes back as UTF-16-LE.
        backing = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(backing, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 data is stored in the file as Latin-1.
        backing = io.BytesIO()
        recoded = codecs.EncodedFile(backing, 'utf-8', 'latin-1')
        recoded.write(b'\xc3\xbc')
        self.assertEqual(backing.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001386
# Text encodings exercised by BasicUnicodeTest (order is significant only
# for the order in which failures are reported).
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8",
]

# "mbcs" is only tested when the codecs module provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]

# Encodings skipped by the incremental encoder/decoder checks: everything
# above plus "idna".
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001504
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Generic checks run against every entry of all_unicode_encodings."""

    def test_basics(self):
        text = "abc123"  # all codecs should be able to encode these
        mismatch = "%r != %r (encoding=%r)"
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the listed spelling,
            # ignoring the "-" versus "_" difference.
            canonical = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                canonical += "_codec"
            elif encoding == "latin_1":
                canonical = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             canonical.replace("_", "-"))

            # stateless encoder/decoder round trip
            raw, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text),
                             mismatch % (consumed, len(text), encoding))
            decoded, consumed = codecs.getdecoder(encoding)(raw)
            self.assertEqual(decoded, text,
                             mismatch % (decoded, text, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                sink = Queue(b"")
                writer = codecs.getwriter(encoding)(sink)
                raw = b""
                for char in text:
                    writer.write(char)
                    piece = sink.read()
                    self.assertTrue(type(piece) is bytes, type(piece))
                    raw += piece
                source = Queue(b"")
                reader = codecs.getreader(encoding)(source)
                decoded = ""
                for byte in raw:
                    source.write(bytes([byte]))
                    decoded += reader.read()
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder (fetched via the Python
            # and C API) and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
                cencoder = _testcapi.codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                raw = b"".join(encoder.encode(char) for char in text)
                raw += encoder.encode("", True)
                decoder = codecs.getincrementaldecoder(encoding)()
                decoded = "".join(decoder.decode(bytes([byte]))
                                  for byte in raw)
                decoded += decoder.decode(b"", True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # check C API
                raw = b"".join(cencoder.encode(char) for char in text)
                raw += cencoder.encode("", True)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                decoded = "".join(cdecoder.decode(bytes([byte]))
                                  for byte in raw)
                decoded += cdecoder.decode(b"", True)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                # check iterencode()/iterdecode()
                chunks = codecs.iterencode(text, encoding)
                result = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, text,
                                 mismatch % (result, text, encoding))

                # check iterencode()/iterdecode() with empty string
                chunks = codecs.iterencode("", encoding)
                result = "".join(codecs.iterdecode(chunks, encoding))
                self.assertEqual(result, "")

            if encoding in ("idna", "mbcs"):
                continue

            # check incremental decoder/encoder with errors argument
            try:
                encoder = codecs.getincrementalencoder(encoding)("ignore")
                cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                raw = b"".join(encoder.encode(char) for char in text)
                decoder = codecs.getincrementaldecoder(encoding)("ignore")
                decoded = "".join(decoder.decode(bytes([byte]))
                                  for byte in raw)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

                raw = b"".join(cencoder.encode(char) for char in text)
                cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                decoded = "".join(cdecoder.decode(bytes([byte]))
                                  for byte in raw)
                self.assertEqual(decoded, text,
                                 mismatch % (decoded, text, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        text = "{}\n{}\n".format("abc123" * 100, "def456" * 100)
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            stream = io.BytesIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(text, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            with self.assertRaises(TypeError):
                decoder()
            if encoding in ("idna", "punycode"):
                continue
            with self.assertRaises(TypeError):
                decoder(42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with self.assertRaises(TypeError):
                encoder()

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        text = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            self.check_state_handling_decode(encoding, text, text.encode(encoding))
            self.check_state_handling_encode(encoding, text, text.encode(encoding))
1637
class CharmapTest(unittest.TestCase):
    """charmap_decode() with a str used as the decoding table."""

    def test_decode_with_string_map(self):
        source = b"\x00\x01\x02"
        # (error handler, decoding table, expected result)
        cases = (
            ("strict", "abc", ("abc", 3)),
            ("replace", "ab", ("ab\ufffd", 3)),
            ("replace", "ab\ufffe", ("ab\ufffd", 3)),
            ("ignore", "ab", ("ab", 3)),
            ("ignore", "ab\ufffe", ("ab", 3)),
        )
        for errors, table, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(source, errors, table),
                expected
            )

        # An empty table with "ignore" drops every byte but still reports
        # the whole input as consumed.
        every_byte = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(every_byte, "ignore", ""),
            ("", len(every_byte))
        )
1670
class WithStmtTest(unittest.TestCase):
    """codecs file wrappers used as context managers."""

    def test_encodedfile(self):
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            self.assertEqual(recoded.read(), b"\xfc")

    def test_streamreaderwriter(self):
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001683
class TypesTest(unittest.TestCase):
    """Which low-level decoders accept str input."""

    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        rejecting = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        mbcs_decode = getattr(codecs, "mbcs_decode", None)
        if mbcs_decode is not None:
            rejecting.append(mbcs_decode)
        for decode in rejecting:
            with self.assertRaises(TypeError):
                decode("xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        self.assertEqual(codecs.unicode_escape_decode(r"\u1234"), expected)
        self.assertEqual(codecs.unicode_escape_decode(br"\u1234"), expected)
        self.assertEqual(codecs.raw_unicode_escape_decode(r"\u1234"), expected)
        self.assertEqual(codecs.raw_unicode_escape_decode(br"\u1234"), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001713
class SurrogateEscapeTest(unittest.TestCase):
    """The "surrogateescape" error handler with several codecs."""

    def _check_roundtrip(self, raw, encoding, text):
        # Undecodable bytes become lone surrogates on decoding, and those
        # surrogates turn back into the same bytes on encoding.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_roundtrip(b"foo\x80bar", "utf-8", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_roundtrip(b"\xed\xb0\x80", "utf-8", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_roundtrip(b"foo\x80bar", "ascii", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_roundtrip(b"foo\xa5bar", "iso-8859-3", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1746
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001747
class BomTest(unittest.TestCase):
    """BOM handling of UTF-16/UTF-32 files opened with codecs.open()."""

    def test_seek0(self):
        data = "1234567890"
        twice = data * 2
        # utf-16, utf-16-le, utf-16-be, utf-32, utf-32-le, utf-32-be
        encodings = ["utf-%s%s" % (width, order)
                     for width in ("16", "32")
                     for order in ("", "-le", "-be")]
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), twice)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), twice)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), twice)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001803
Victor Stinner3fed0872010-05-22 02:16:27 +00001804
# Bytes-to-bytes codecs exercised by TransformCodecTest.  The compression
# codecs are only listed when their backing module can be imported.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]

try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings.append("zlib_codec")

try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
1823
class TransformCodecTest(unittest.TestCase):
    """Round trips through the bytes-to-bytes transform codecs."""

    def test_basics(self):
        payload = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, payload)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        skipped = ['uu_codec', 'zlib_codec']
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            encoded = codecs.encode(b"\x80", encoding)
            stream = io.BytesIO(encoded)
            reader = codecs.getreader(encoding)(stream)
            self.assertEqual(reader.readline(), b"\x80")
1851
1852
Victor Stinner62be4fb2011-10-18 21:46:37 +02001853@unittest.skipUnless(sys.platform == 'win32',
1854 'code pages are specific to Windows')
Victor Stinner3a50e702011-10-18 21:21:00 +02001855class CodePageTest(unittest.TestCase):
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001856 # CP_UTF8 is already tested by CP65001Test
Victor Stinner3a50e702011-10-18 21:21:00 +02001857 CP_UTF8 = 65001
Victor Stinner62be4fb2011-10-18 21:46:37 +02001858
def test_invalid_code_page(self):
    # CodePageTest method.  A negative code page number is rejected with
    # ValueError; code page 123 is expected to fail with WindowsError.
    with self.assertRaises(ValueError):
        codecs.code_page_encode(-1, 'a')
    with self.assertRaises(ValueError):
        codecs.code_page_decode(-1, b'a')
    with self.assertRaises(WindowsError):
        codecs.code_page_encode(123, 'a')
    with self.assertRaises(WindowsError):
        codecs.code_page_decode(123, b'a')
1864
def test_code_page_name(self):
    # CodePageTest method.  The error message must mention the code page
    # the failing call was made with.
    with self.assertRaisesRegex(UnicodeEncodeError, 'cp932'):
        codecs.code_page_encode(932, '\xff')
    with self.assertRaisesRegex(UnicodeDecodeError, 'cp932'):
        codecs.code_page_decode(932, b'\x81\x00')
    with self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8'):
        codecs.code_page_decode(self.CP_UTF8, b'\xff')
1872
def check_decode(self, cp, tests):
    # CodePageTest helper.  *tests* yields (raw, errors, expected)
    # triples; expected=None means decoding must raise
    # UnicodeDecodeError.
    for raw, errors, expected in tests:
        if expected is None:
            self.assertRaises(UnicodeDecodeError,
                codecs.code_page_decode, cp, raw, errors)
            continue

        try:
            result = codecs.code_page_decode(cp, raw, errors)
        except UnicodeDecodeError as err:
            self.fail('Unable to decode %a from "cp%s" with '
                      'errors=%r: %s' % (raw, cp, errors, err))
        text = result[0]
        consumed = result[1]
        self.assertEqual(text, expected,
            '%a.decode("cp%s", %r)=%a != %a'
            % (raw, cp, errors, text, expected))
        # assert 0 <= consumed <= len(raw)
        self.assertGreaterEqual(consumed, 0)
        self.assertLessEqual(consumed, len(raw))
1890
def check_encode(self, cp, tests):
    # CodePageTest helper.  *tests* yields (text, errors, expected)
    # triples; expected=None means encoding must raise
    # UnicodeEncodeError.
    for text, errors, expected in tests:
        if expected is None:
            self.assertRaises(UnicodeEncodeError,
                codecs.code_page_encode, cp, text, errors)
            continue

        try:
            result = codecs.code_page_encode(cp, text, errors)
        except UnicodeEncodeError as err:
            self.fail('Unable to encode %a to "cp%s" with '
                      'errors=%r: %s' % (text, cp, errors, err))
        data = result[0]
        self.assertEqual(data, expected,
            '%a.encode("cp%s", %r)=%a != %a'
            % (text, cp, errors, data, expected))
        # the whole input must be reported as consumed
        self.assertEqual(result[1], len(text))
1906
1907 def test_cp932(self):
1908 self.check_encode(932, (
1909 ('abc', 'strict', b'abc'),
1910 ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001911 # test error handlers
Victor Stinner3a50e702011-10-18 21:21:00 +02001912 ('\xff', 'strict', None),
1913 ('[\xff]', 'ignore', b'[]'),
1914 ('[\xff]', 'replace', b'[y]'),
1915 ('[\u20ac]', 'replace', b'[?]'),
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001916 ('[\xff]', 'backslashreplace', b'[\\xff]'),
1917 ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02001918 ))
Victor Stinner9e921882011-10-18 21:55:25 +02001919 self.check_decode(932, (
Victor Stinner3a50e702011-10-18 21:21:00 +02001920 (b'abc', 'strict', 'abc'),
1921 (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
1922 # invalid bytes
Victor Stinner2f3ca9f2011-10-27 01:38:56 +02001923 (b'[\xff]', 'strict', None),
1924 (b'[\xff]', 'ignore', '[]'),
1925 (b'[\xff]', 'replace', '[\ufffd]'),
1926 (b'[\xff]', 'surrogateescape', '[\udcff]'),
Victor Stinner3a50e702011-10-18 21:21:00 +02001927 (b'\x81\x00abc', 'strict', None),
1928 (b'\x81\x00abc', 'ignore', '\x00abc'),
Victor Stinner9e921882011-10-18 21:55:25 +02001929 (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
1930 ))
Victor Stinner3a50e702011-10-18 21:21:00 +02001931
1932 def test_cp1252(self):
1933 self.check_encode(1252, (
1934 ('abc', 'strict', b'abc'),
1935 ('\xe9\u20ac', 'strict', b'\xe9\x80'),
1936 ('\xff', 'strict', b'\xff'),
1937 ('\u0141', 'strict', None),
1938 ('\u0141', 'ignore', b''),
1939 ('\u0141', 'replace', b'L'),
1940 ))
1941 self.check_decode(1252, (
1942 (b'abc', 'strict', 'abc'),
1943 (b'\xe9\x80', 'strict', '\xe9\u20ac'),
1944 (b'\xff', 'strict', '\xff'),
1945 ))
1946
1947 def test_cp_utf7(self):
1948 cp = 65000
1949 self.check_encode(cp, (
1950 ('abc', 'strict', b'abc'),
1951 ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
1952 ('\U0010ffff', 'strict', b'+2//f/w-'),
1953 ('\udc80', 'strict', b'+3IA-'),
1954 ('\ufffd', 'strict', b'+//0-'),
1955 ))
1956 self.check_decode(cp, (
1957 (b'abc', 'strict', 'abc'),
1958 (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
1959 (b'+2//f/w-', 'strict', '\U0010ffff'),
1960 (b'+3IA-', 'strict', '\udc80'),
1961 (b'+//0-', 'strict', '\ufffd'),
1962 # invalid bytes
1963 (b'[+/]', 'strict', '[]'),
1964 (b'[\xff]', 'strict', '[\xff]'),
1965 ))
1966
def test_multibyte_encoding(self):
    # Error handlers applied to multibyte input: one undecodable leading
    # byte followed by a valid multibyte sequence.
    cp932_decode_cases = (
        (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
        (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
    )
    self.check_decode(932, cp932_decode_cases)

    utf8_decode_cases = (
        (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
        (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
    )
    self.check_decode(self.CP_UTF8, utf8_decode_cases)

    # The encode half only runs when the module-level VISTA_OR_LATER
    # flag is set (NOTE(review): presumably older Windows versions
    # handle the lone surrogate differently -- confirm).
    if not VISTA_OR_LATER:
        return

    utf8_encode_cases = (
        ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
        ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
    )
    self.check_encode(self.CP_UTF8, utf8_encode_cases)
1981
1982 def test_incremental(self):
1983 decoded = codecs.code_page_decode(932,
1984 b'\xe9\x80\xe9', 'strict',
1985 False)
1986 self.assertEqual(decoded, ('\u9a3e', 2))
1987
1988 decoded = codecs.code_page_decode(932,
1989 b'\xe9\x80\xe9\x80', 'strict',
1990 False)
1991 self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
1992
1993 decoded = codecs.code_page_decode(932,
1994 b'abc', 'strict',
1995 False)
1996 self.assertEqual(decoded, ('abc', 3))
1997
1998
def test_main():
    # Hand every test case class of this module to support.run_unittest,
    # in the order listed here.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        CP65001Test,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
        CodePageTest,
    )
    support.run_unittest(*test_classes)
Fred Drake2e2be372001-09-20 21:33:42 +00002030
2031
# Script entry point: run the whole suite when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    test_main()