blob: f70ae3364a193f30b315019fbe97800d9f7d7ea4 [file] [log] [blame]
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001from test import support
Victor Stinner98fe1a02011-05-27 01:51:18 +02002import unittest
Victor Stinner05010702011-05-27 16:50:40 +02003import codecs
Antoine Pitroucf9d3c02011-07-24 02:27:04 +02004import locale
Victor Stinner05010702011-05-27 16:50:40 +02005import sys, _testcapi, io
Victor Stinner182d90d2011-09-29 19:53:55 +02006import ctypes
7
# Size, in bytes, of one ctypes.c_wchar on this platform (as reported by
# ctypes.sizeof).
SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
Marc-André Lemburga37171d2001-06-19 20:09:28 +00009
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    The pending data is kept in a single sliceable sequence of the same
    type as the object handed to the constructor.
    """
    def __init__(self, buffer):
        # Initial content; read() returns slices of this object's type.
        self._buffer = buffer

    def write(self, chars):
        # Append to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # Remove and return up to `size` items from the front of the
        # pending data.  A negative size drains everything.
        if size < 0:
            size = len(self._buffer)
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
29
class MixInCheckStateHandling:
    # Mix-in for unittest.TestCase subclasses: verifies that the state of an
    # incremental codec can be extracted with getstate() and transplanted
    # into a fresh codec with setstate() at every possible split point.

    def check_state_handling_decode(self, encoding, u, s):
        # `s` is the encoded form of `u`; split `s` at every offset.
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        # `s` is the encoded form of `u`; split `u` at every offset and
        # finish the encoding with a second encoder that inherited the state.
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
62
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
    # Stream-reading tests shared by the per-encoding test classes.  The
    # methods read the codec name from `self.encoding`, which subclasses
    # define as a class attribute.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read line by line until exhaustion; lines are joined with "|"
            # so the split points are visible in the result.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends must be spelled out as a tuple: every one of them is
        # a whitespace character, so "\n \r\n \r \u2028".split() returns an
        # empty list and loops over it never execute.
        lineends = ("\n", "\r\n", "\r", "\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # i*200+200 keeps the first line non-empty; "\u3042" is a real
            # non-ASCII character ("\3042" would be an octal escape + "2").
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        # readalllines() joins with "|", so the expectation must too.
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        # Iterating the reader must hand back the input lines unchanged.
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data arrives piecemeal.
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")
Walter Dörwald9fa09462005-01-10 12:01:39 +0000282
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # "spamspam" preceded by a single BOM, in each byte order.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
369
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # No BOM here: a character appears once its fourth byte is read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
409
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # No BOM here: a character appears once its fourth byte is read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
449
450
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # "spamspam" preceded by a single BOM, in each byte order.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup before the file is created so it is removed
        # even if a later step fails.
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        # NOTE(review): the 'U' mode looks like the very case this
        # regression test exercises -- confirm before changing it.
        with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunac1c415f2010-02-26 11:12:33 +0000526
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # No BOM here: a character appears once its second byte is read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
554
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # No BOM here: a character appears once its second byte is read.
        self.check_partial(
            "\x00\xff\u0100\uffff",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                          b"\xff", "strict", True)

    def test_nonbmp(self):
        # A non-BMP character round-trips through a surrogate pair.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
582
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # One entry per encoded byte: a character appears only once its
        # last byte has been fed.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_decoder_state(self):
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_lone_surrogates(self):
        # Strict mode rejects lone surrogates in both directions ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
        # ... while each error handler substitutes its own replacement.
        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000633
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # One expected prefix per encoded byte of the input.
        self.check_partial(
            "a+-b",
            [
                "a",
                "a",
                "a+",
                "a+-",
                "a+-b",
            ]
        )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000648
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # A single byte with final=True is rejected in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the data argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
656
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        # An array.array is accepted and its content returned as bytes.
        self.assertEqual(
            codecs.readbuffer_encode(array.array("b", b"spam")),
            (b"spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
672
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream(self, bytestring, unistring):
        # Decode `bytestring` through a utf-8-sig StreamReader using a range
        # of read sizes (and an unsized read) and check that the chunks
        # always concatenate to `unistring`.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM: the BOM must be dropped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
752
class EscapeDecodeTest(unittest.TestCase):
    """Tests for the codecs.escape_decode() helper."""

    def test_empty(self):
        """Empty input decodes to empty bytes with zero bytes consumed."""
        # escape_decode() returns a bytes object, and bytes never compare
        # equal to str in Python 3, so the expected value has to be b"".
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
Walter Dörwald3abcb012007-04-16 22:10:50 +0000756
class RecodingTest(unittest.TestCase):
    """Regression test for writing through a codecs.EncodedFile wrapper."""

    def test_recoding(self):
        # Wrap an in-memory byte stream in a recoding file object, push one
        # character through it and close it.  There is deliberately no
        # assertion: the test only has to run to completion.
        f = io.BytesIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write("a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000765
# From RFC 3492
# Each entry is a pair (text, encoded): the text as a str and the expected
# Punycode form as ASCII bytes.  The letters in the comments label the
# individual sample strings.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check of the table above: print any entry that is not a
# 2-tuple (for example when a comma between the two halves was forgotten and
# the literals were concatenated).  It only reports; it does not raise.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000873
class PunycodeTest(unittest.TestCase):
    """Run the punycode codec over every pair in punycode_testcases."""

    def test_encode(self):
        """Encoding each sample text gives the expected bytes, ignoring case."""
        for text, expected in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just the expected
            # value to lower is also insufficient, since some of the input
            # characters are upper case.
            produced = text.encode("punycode").decode("ascii")
            wanted = expected.decode("ascii")
            self.assertEqual(produced.lower(), wanted.lower())

    def test_decode(self):
        """Decoding each expected byte string gives back the sample text."""
        for text, encoded in punycode_testcases:
            self.assertEqual(text, encoded.decode("punycode"))
            # Decode once more from a bytes object rebuilt via an ASCII
            # round trip.
            rebuilt = encoded.decode("ascii").encode("ascii")
            self.assertEqual(text, rebuilt.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000892
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec.

    The decoding tests use 4-byte code units and are skipped unless
    SIZEOF_WCHAR_T is 4.
    """

    @staticmethod
    def _native_order(data):
        """Return *data* with its bytes reversed on little-endian hosts."""
        if sys.byteorder == "little":
            return bytes(reversed(data))
        return data

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        """Valid units decode; out-of-range or truncated input is rejected."""
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        # Inputs are written big-endian and converted to host order below.
        valid = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        invalid = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for raw, expected in valid:
            native = self._native_order(raw)
            self.assertEqual(expected, native.decode("unicode_internal"))
        for raw in invalid:
            native = self._native_order(raw)
            with self.assertRaises(UnicodeDecodeError):
                native.decode("unicode_internal")

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        """The UnicodeDecodeError raised carries the expected attributes."""
        payload = b"\x00\x00\x00\x00\x00\x11\x11\x00"
        with self.assertRaises(UnicodeDecodeError) as caught:
            payload.decode("unicode_internal")
        error = caught.exception
        self.assertEqual("unicode_internal", error.encoding)
        self.assertEqual(payload, error.object)
        # The reported range covers the second group of four bytes.
        self.assertEqual(4, error.start)
        self.assertEqual(8, error.end)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        """A registered error handler is consulted when decoding fails."""
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = "ab".encode("unicode_internal").decode()
        # Put four '"' bytes (0x22) between the first four characters of the
        # encoded "ab" and the rest.
        head, tail = ab[:4], ab[4:]
        data = bytes("%s\x22\x22\x22\x22%s" % (head, tail), "ascii")
        ignored = decoder(data, "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        """Check the length reported as the second item of encoder results."""
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        one_char_length = encoder("a")[1]
        self.assertEqual(one_char_length, 1)
        two_char_length = encoder("\xe9\u0142")[1]
        self.assertEqual(two_char_length, 2)

        # The input here is the four characters backslash, 'x', '0', '0'.
        escaped_length = codecs.escape_encode(br'\x00')[1]
        self.assertEqual(escaped_length, 4)
Philip Jenvey66a1bd52010-04-05 03:05:24 +0000949
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is a pair (orig, prepped) of UTF-8 encoded byte strings.
#  - prepped is None when nameprep is expected to reject the input.
#  - (None, None) stands in for a vector that is skipped; it keeps the list
#    position in step with the "3.N" numbering, which NameprepTest derives
#    from the index when reporting a failure.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE: the bytes \xef\xb9\xb6 below are the UTF-8 form of U+FE76, not
    # of the U+FB38 named in the title above.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1102
1103
class NameprepTest(unittest.TestCase):
    """Run encodings.idna.nameprep over the nameprep_tests vectors."""

    def test_nameprep(self):
        """Check every vector; a failure is reported with its 3.N number."""
        from encodings.idna import nameprep
        # Count from 1 so the counter matches the "3.N" vector numbering.
        for number, (raw, expected) in enumerate(nameprep_tests, 1):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            text = str(raw, "utf-8", "surrogatepass")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = str(expected, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (number, str(e)))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001122
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use.

    Every group covers an ASCII name and a non-ASCII name, each with and
    without a trailing dot.
    """

    def test_builtin_decode(self):
        """str(bytes, "idna") decodes whole names."""
        self.assertEqual(str(b"python.org", "idna"), "python.org")
        self.assertEqual(str(b"python.org.", "idna"), "python.org.")
        self.assertEqual(str(b"xn--pythn-mua.org", "idna"), "pyth\xf6n.org")
        self.assertEqual(str(b"xn--pythn-mua.org.", "idna"), "pyth\xf6n.org.")

    def test_builtin_encode(self):
        """str.encode("idna") encodes whole names."""
        self.assertEqual("python.org".encode("idna"), b"python.org")
        self.assertEqual("python.org.".encode("idna"), b"python.org.")
        self.assertEqual("pyth\xf6n.org".encode("idna"), b"xn--pythn-mua.org")
        self.assertEqual("pyth\xf6n.org.".encode("idna"), b"xn--pythn-mua.org.")

    def test_stream(self):
        """After read(3) has been called on a 3-byte stream, read() is empty."""
        r = codecs.getreader("idna")(io.BytesIO(b"abc"))
        r.read(3)
        self.assertEqual(r.read(), "")

    def test_incremental_decode(self):
        """Incremental decoding, fed byte by byte and in uneven chunks."""
        def decode_bytewise(data):
            # Hand the decoder one byte at a time.
            pieces = (bytes([c]) for c in data)
            return "".join(codecs.iterdecode(pieces, "idna"))

        self.assertEqual(decode_bytewise(b"python.org"), "python.org")
        self.assertEqual(decode_bytewise(b"python.org."), "python.org.")
        # The dot-less non-ASCII name used to be missing here: the
        # trailing-dot case below was simply asserted twice.
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org"),
                         "pyth\xf6n.org")
        self.assertEqual(decode_bytewise(b"xn--pythn-mua.org."),
                         "pyth\xf6n.org.")

        # A label is only emitted once the dot that ends it has been seen;
        # the unterminated last label comes out on the final call.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # Same input with a trailing dot: nothing is left for the final call.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        """Incremental encoding, fed character by character and in chunks."""
        def encode_charwise(text):
            # iterencode() iterates over the str, one character per call.
            return b"".join(codecs.iterencode(text, "idna"))

        self.assertEqual(encode_charwise("python.org"), b"python.org")
        self.assertEqual(encode_charwise("python.org."), b"python.org.")
        # The dot-less non-ASCII name used to be missing here: the
        # trailing-dot case below was simply asserted twice.
        self.assertEqual(encode_charwise("pyth\xf6n.org"),
                         b"xn--pythn-mua.org")
        self.assertEqual(encode_charwise("pyth\xf6n.org."),
                         b"xn--pythn-mua.org.")

        # The last label is held back until a dot or the final call.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        # Same input with a trailing dot: nothing is left for the final call.
        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001198
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level functions of the codecs module."""

    def test_decode(self):
        """codecs.decode(): normal use, default encoding and error cases."""
        decoded = codecs.decode(b'\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(decoded, '\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.decode()
        # No encoding given.
        self.assertEqual(codecs.decode(b'abc'), 'abc')
        with self.assertRaises(UnicodeDecodeError):
            codecs.decode(b'\xff', 'ascii')

    def test_encode(self):
        """codecs.encode(): normal use, default encoding and error cases."""
        encoded = codecs.encode('\xe4\xf6\xfc', 'latin-1')
        self.assertEqual(encoded, b'\xe4\xf6\xfc')
        with self.assertRaises(TypeError):
            codecs.encode()
        with self.assertRaises(LookupError):
            codecs.encode("foo", "__spam__")
        # No encoding given.
        self.assertEqual(codecs.encode('abc'), b'abc')
        # Note that '\xffff' is the character U+00FF followed by "ff".
        with self.assertRaises(UnicodeEncodeError):
            codecs.encode('\xffff', 'ascii')

    def test_register(self):
        """codecs.register() needs exactly one callable argument."""
        with self.assertRaises(TypeError):
            codecs.register()
        with self.assertRaises(TypeError):
            codecs.register(42)

    def test_lookup(self):
        """codecs.lookup() rejects a missing, unknown or blank name."""
        with self.assertRaises(TypeError):
            codecs.lookup()
        with self.assertRaises(LookupError):
            codecs.lookup("__spam__")
        with self.assertRaises(LookupError):
            codecs.lookup(" ")

    def test_getencoder(self):
        """codecs.getencoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getencoder()
        with self.assertRaises(LookupError):
            codecs.getencoder("__spam__")

    def test_getdecoder(self):
        """codecs.getdecoder() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getdecoder()
        with self.assertRaises(LookupError):
            codecs.getdecoder("__spam__")

    def test_getreader(self):
        """codecs.getreader() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getreader()
        with self.assertRaises(LookupError):
            codecs.getreader("__spam__")

    def test_getwriter(self):
        """codecs.getwriter() rejects a missing or unknown name."""
        with self.assertRaises(TypeError):
            codecs.getwriter()
        with self.assertRaises(LookupError):
            codecs.getwriter("__spam__")

    def test_lookup_issue1813(self):
        """An upper-case codec name is still found under a Turkish locale."""
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as "ı" (dotless i)
        saved_ctype = locale.setlocale(locale.LC_CTYPE)
        # Restore the previous LC_CTYPE setting when the test is over.
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        info = codecs.lookup('ASCII')
        self.assertEqual(info.name, 'ascii')
1253
class StreamReaderTest(unittest.TestCase):
    """Tests for codecs.StreamReader, using the UTF-8 reader."""

    def setUp(self):
        """Prepare a UTF-8 reader factory and a two-line byte stream."""
        # Two three-byte UTF-8 sequences with a newline between them.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        """readlines() splits the decoded text and keeps the line ending."""
        wrapped = self.reader(self.stream)
        lines = wrapped.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001263
class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        # Reading: the UTF-8 file content comes back recoded as UTF-16-LE.
        source = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        recoded = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(recoded.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input is stored in the file as Latin-1.
        sink = io.BytesIO()
        codecs.EncodedFile(sink, 'utf-8', 'latin-1').write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
Thomas Wouters89f507f2006-12-13 04:49:30 +00001275
# Text codecs exercised by the generic tests, grouped by family and kept
# in the same (sorted) order as before.
all_unicode_encodings = [
    "ascii",
    "big5", "big5hkscs",
    "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9",
    "johab",
    "koi8_r", "koi8_u",
    "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is added only when this build exposes codecs.mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = ["punycode", "unicode_internal"]
# Incremental coders are broken for those and for "idna" as well.
broken_incremental_coders = broken_unicode_with_streams + ["idna"]
Thomas Wouters89f507f2006-12-13 04:49:30 +00001393
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):

    def _assert_same(self, got, expected, encoding):
        # assertEqual() with the failure message shared by all the
        # round-trip checks in this class.
        self.assertEqual(got, expected,
                         "%r != %r (encoding=%r)" % (got, expected, encoding))

    def _check_stream_roundtrip(self, encoding, s):
        # check stream reader/writer: write one character at a time and
        # drain the queue after every write ...
        sink = Queue(b"")
        writer = codecs.getwriter(encoding)(sink)
        pieces = []
        for char in s:
            writer.write(char)
            chunk = sink.read()
            self.assertTrue(type(chunk) is bytes, type(chunk))
            pieces.append(chunk)
        encoded = b"".join(pieces)
        # ... then feed the result back to a reader one byte at a time.
        source = Queue(b"")
        reader = codecs.getreader(encoding)(source)
        decoded = ""
        for byte in encoded:
            source.write(bytes([byte]))
            decoded += reader.read()
        self._assert_same(decoded, s, encoding)

    def _check_incremental_roundtrip(self, encoding, s):
        # check incremental decoder/encoder (fetched via the Python
        # and C API) and iterencode()/iterdecode()
        try:
            encoder = codecs.getincrementalencoder(encoding)()
            cencoder = _testcapi.codec_incrementalencoder(encoding)
        except LookupError:  # no IncrementalEncoder
            return

        # check incremental decoder/encoder
        encoded = b"".join(encoder.encode(char) for char in s)
        encoded += encoder.encode("", True)
        decoder = codecs.getincrementaldecoder(encoding)()
        decoded = "".join(decoder.decode(bytes([byte])) for byte in encoded)
        decoded += decoder.decode(b"", True)
        self._assert_same(decoded, s, encoding)

        # check C API
        encoded = b"".join(cencoder.encode(char) for char in s)
        encoded += cencoder.encode("", True)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding)
        decoded = "".join(cdecoder.decode(bytes([byte])) for byte in encoded)
        decoded += cdecoder.decode(b"", True)
        self._assert_same(decoded, s, encoding)

        # check iterencode()/iterdecode()
        chunks = codecs.iterencode(s, encoding)
        result = "".join(codecs.iterdecode(chunks, encoding))
        self._assert_same(result, s, encoding)

        # check iterencode()/iterdecode() with empty string
        chunks = codecs.iterencode("", encoding)
        result = "".join(codecs.iterdecode(chunks, encoding))
        self.assertEqual(result, "")

    def _check_incremental_roundtrip_ignore(self, encoding, s):
        # check incremental decoder/encoder with errors argument
        try:
            encoder = codecs.getincrementalencoder(encoding)("ignore")
            cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
        except LookupError:  # no IncrementalEncoder
            return

        encoded = b""
        for char in s:
            encoded += encoder.encode(char)
        decoder = codecs.getincrementaldecoder(encoding)("ignore")
        decoded = ""
        for byte in encoded:
            decoded += decoder.decode(bytes([byte]))
        self._assert_same(decoded, s, encoding)

        # Same again through the C API objects.
        encoded = b""
        for char in s:
            encoded += cencoder.encode(char)
        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
        decoded = ""
        for byte in encoded:
            decoded += cdecoder.decode(bytes([byte]))
        self._assert_same(decoded, s, encoding)

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered name has to match the requested one once
            # '_' and '-' are treated alike.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # Stateless encode followed by stateless decode.
            encoded, consumed = codecs.getencoder(encoding)(s)
            self._assert_same(consumed, len(s), encoding)
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self._assert_same(decoded, s, encoding)

            if encoding not in broken_unicode_with_streams:
                self._check_stream_roundtrip(encoding, s)

            if encoding not in broken_incremental_coders:
                self._check_incremental_roundtrip(encoding, s)
                if encoding not in ("idna", "mbcs"):
                    self._check_incremental_roundtrip_ignore(encoding, s)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            # "idna" is skipped too. FIXME: See SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            raw = io.BytesIO(s.encode(encoding))
            reader = codecs.getreader(encoding)(raw)
            for _ in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                self.assertEqual(s, reader.read())

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            # Calling without input is always a TypeError ...
            self.assertRaises(TypeError, decoder)
            # ... and so is an int, except for the two codecs excluded here.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            # Every encoder must refuse to be called without input.
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            encoded = u.encode(encoding)
            self.check_state_handling_decode(encoding, u, encoded)
            self.check_state_handling_encode(encoding, u, encoded)
1526
class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        # (error handler, decoding map, expected text) for the same three
        # input bytes; all three bytes are always reported as consumed.
        data = b"\x00\x01\x02"
        cases = (
            ("strict", "abc", "abc"),
            ("replace", "ab", "ab\ufffd"),
            ("replace", "ab\ufffe", "ab\ufffd"),
            ("ignore", "ab", "ab"),
            ("ignore", "ab\ufffe", "ab"),
        )
        for errors, mapping, expected in cases:
            self.assertEqual(
                codecs.charmap_decode(data, errors, mapping),
                (expected, 3)
            )

        # With an empty map and "ignore", every byte is dropped but counted.
        allbytes = bytes(range(256))
        decoded = codecs.charmap_decode(allbytes, "ignore", "")
        self.assertEqual(decoded, ("", len(allbytes)))
1559
class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        # EncodedFile objects work as context managers.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as recoded:
            content = recoded.read()
            self.assertEqual(content, b"\xfc")

    def test_streamreaderwriter(self):
        # StreamReaderWriter objects work as context managers.
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        wrapper = codecs.StreamReaderWriter(raw, utf8.streamreader,
                                            utf8.streamwriter, 'strict')
        with wrapper as srw:
            self.assertEqual(srw.read(), "\xfc")
Thomas Wouters89f507f2006-12-13 04:49:30 +00001572
class TypesTest(unittest.TestCase):
    def test_decode_unicode(self):
        # Most decoders don't accept unicode input
        names = [
            "utf_7_decode",
            "utf_8_decode",
            "utf_16_le_decode",
            "utf_16_be_decode",
            "utf_16_ex_decode",
            "utf_32_decode",
            "utf_32_le_decode",
            "utf_32_be_decode",
            "utf_32_ex_decode",
            "latin_1_decode",
            "ascii_decode",
            "charmap_decode",
        ]
        if hasattr(codecs, "mbcs_decode"):
            names.append("mbcs_decode")
        # Resolve every function up front, then probe each with a str.
        decoders = [getattr(codecs, name) for name in names]
        for decoder in decoders:
            self.assertRaises(TypeError, decoder, "xxx")

    def test_unicode_escape(self):
        # Escape-decoding a unicode string is supported and gives the same
        # result as decoding the equivalent ASCII bytes string.
        expected = ("\u1234", 6)
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            for escaped in (r"\u1234", br"\u1234"):
                self.assertEqual(decode(escaped), expected)
Antoine Pitrou81fabdb2009-01-22 10:11:36 +00001602
class SurrogateEscapeTest(unittest.TestCase):

    def _check_both_ways(self, encoding, raw, text):
        # Decoding *raw* with "surrogateescape" must give *text*, and
        # encoding *text* the same way must give *raw* back.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # Bad byte
        self._check_both_ways("utf-8", b"foo\x80bar", "foo\udc80bar")
        # bad-utf-8 encoded surrogate
        self._check_both_ways("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # bad byte
        self._check_both_ways("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # bad byte: \xa5 is unmapped in iso-8859-3
        self._check_both_ways("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        escaped = "\udce4\udceb\udcef\udcf6\udcfc"
        self.assertEqual(escaped.encode("latin-1", "surrogateescape"),
                         b"\xe4\xeb\xef\xf6\xfc")
1635
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001636
class BomTest(unittest.TestCase):

    def _reopen(self, encoding):
        # Open the scratch file for update with the given encoding.
        return codecs.open(support.TESTFN, 'w+', encoding=encoding)

    def test_seek0(self):
        data = "1234567890"
        doubled = data * 2
        encodings = (
            "utf-16", "utf-16-le", "utf-16-be",
            "utf-32", "utf-32-le", "utf-32-be",
        )
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with self._reopen(encoding) as f:
                f.write(data)
                f.write(data)
                # Read the content back twice, rewinding before each read.
                for _ in range(2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with self._reopen(encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with self._reopen(encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with self._reopen(encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with self._reopen(encoding) as f:
                writer = f.writer
                writer.write(data)
                writer.seek(writer.tell())
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
Victor Stinnera92ad7e2010-05-22 16:59:09 +00001692
Victor Stinner3fed0872010-05-22 02:16:27 +00001693
# Bytes-to-bytes codecs that are always present.
bytes_transform_encodings = [
    "base64_codec", "uu_codec", "quopri_codec", "hex_codec",
]
# The compression codecs are listed only when their module can be imported.
try:
    import zlib
except ImportError:
    pass
else:
    bytes_transform_encodings += ["zlib_codec"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings += ["bz2_codec"]
1712
class TransformCodecTest(unittest.TestCase):

    def _reader_for(self, encoding, payload):
        # Encode *payload* with the codec and wrap the result in a
        # StreamReader for that same codec.
        encoded = codecs.encode(payload, encoding)
        return codecs.getreader(encoding)(io.BytesIO(encoded))

    def test_basics(self):
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            # generic codecs interface
            encoded, consumed = codecs.getencoder(encoding)(binput)
            self.assertEqual(consumed, len(binput))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(consumed, len(encoded))
            self.assertEqual(decoded, binput)

    def test_read(self):
        for encoding in bytes_transform_encodings:
            reader = self._reader_for(encoding, b"\x80")
            self.assertEqual(reader.read(), b"\x80")

    def test_readline(self):
        # These two codecs are left out of the readline() check.
        skipped = ('uu_codec', 'zlib_codec')
        for encoding in bytes_transform_encodings:
            if encoding in skipped:
                continue
            reader = self._reader_for(encoding, b"\x80")
            self.assertEqual(reader.readline(), b"\x80")
1740
1741
def test_main():
    # Run the test classes listed here, in this order, through the
    # test.support runner.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        BasicUnicodeTest,
        CharmapTest,
        WithStmtTest,
        TypesTest,
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()