blob: 9ae0ed084815d78692a7020cf3c9a680b421e797 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02005import sys, StringIO
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Return a checker bound to test case *self* and codec function *coder*.

    The returned ``check(input, expect)`` asserts that ``coder(input)``
    equals ``(expect, len(input))``: the expected output paired with a
    consumed length covering the whole input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
11
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    Minimal in-memory FIFO exposing just the ``write()``/``read()`` pair
    that the stream reader/writer tests in this file rely on.
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append to the tail of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything that is currently buffered;
        # otherwise at most ``size`` items are removed from the head.
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
31
class ReadTest(unittest.TestCase):
    """Shared reader tests; subclasses provide the ``encoding`` attribute."""

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty result and join the
            # lines with "|" so that the split points are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # The fixture only needs to round-trip line by line; every entry
        # ends in "\r\n" and contains no other line break.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data written after a
        # readline() call becomes visible to the next call.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
295
class UTF32Test(ReadTest):
    """Tests for the BOM-detecting "utf-32" codec."""
    encoding = "utf-32"

    # u"spamspam" encoded with a single leading BOM, in either byte order.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
380
class UTF32LETest(ReadTest):
    """Tests for the little-endian "utf-32-le" codec (no BOM handling)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # Four bytes per character: a character only appears once its
        # fourth byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
424
class UTF32BETest(ReadTest):
    """Tests for the big-endian "utf-32-be" codec (no BOM handling)."""
    encoding = "utf-32-be"

    def test_partial(self):
        # Four bytes per character: a character only appears once its
        # fourth byte has been fed.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
468
469
class UTF16Test(ReadTest):
    """Tests for the BOM-detecting "utf-16" codec."""
    encoding = "utf-16"

    # u"spamspam" encoded with a single leading BOM, in either byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertIn(d, (self.spamle, self.spambe))
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000542
class UTF16LETest(ReadTest):
    """Tests for the little-endian "utf-16-le" codec (no BOM handling)."""
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with the text expected from the
        # 'replace' handler; 'strict' must reject every one of them.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000579
class UTF16BETest(ReadTest):
    """Tests for the big-endian "utf-16-be" codec (no BOM handling)."""
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with the text expected from the
        # 'replace' handler; 'strict' must reject every one of them.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000616
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # The input mixes 1-, 2-, 3- and 4-byte sequences; a character only
        # appears once the last byte of its sequence has been fed.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
641
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d)
        self.assertEqual(set_d.decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o)
        self.assertEqual(set_o.decode(self.encoding), set_o)
        # +
        self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
        self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws)
        self.assertEqual(ws.decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with the text expected from the
        # 'replace' handler; 'strict' must reject every one of them.
        tests = [
            ('\xffb', u'\ufffdb'),
            ('a\xffb', u'a\ufffdb'),
            ('a\xff\xffb', u'a\ufffd\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+IKw-b\xff', u'a\u20acb\ufffd'),
            ('a+IKw\xffb', u'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
        self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
        self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
        self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         '+IKwgrNgB3KA-')
        self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')
        self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Only the 'replace' handler is exercised for these inputs.
        tests = [
            ('a+2AE-b', u'a\ud801b'),
            ('a+2AE\xffb', u'a\ufffdb'),
            ('a+2AE', u'a\ufffd'),
            ('a+2AEA-b', u'a\ufffdb'),
            ('a+2AH-b', u'a\ufffdb'),
            ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
            ('a+IKzYAQ\xffb', u'a\u20ac\ufffdb'),
            ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
            ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
            ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
            ('a+IKwgrNgB\xffb', u'a\u20ac\u20ac\ufffdb'),
            ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300744
Walter Dörwalde22d3392005-11-17 08:52:34 +0000745class UTF16ExTest(unittest.TestCase):
746
747 def test_errors(self):
748 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
749
750 def test_bad_args(self):
751 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
752
753class ReadBufferTest(unittest.TestCase):
754
755 def test_array(self):
756 import array
757 self.assertEqual(
758 codecs.readbuffer_encode(array.array("c", "spam")),
759 ("spam", 4)
760 )
761
762 def test_empty(self):
763 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
764
765 def test_bad_args(self):
766 self.assertRaises(TypeError, codecs.readbuffer_encode)
767 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
768
769class CharBufferTest(unittest.TestCase):
770
771 def test_string(self):
772 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
773
774 def test_empty(self):
775 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
776
777 def test_bad_args(self):
778 self.assertRaises(TypeError, codecs.charbuffer_encode)
779 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
780
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec (UTF-8 with an optional leading BOM)."""
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        # A lone BOM must decode to the empty string.
        self.assertEqual(unicode("\xef\xbb\xbf", "utf-8-sig"), u"")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Read bytestring through a utf-8-sig StreamReader with a range of
        # read() size hints (None means "no argument") and check that the
        # concatenated chunks always equal unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # Input that starts with a BOM: the BOM must not show up in the text.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same text.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
864
class EscapeDecodeTest(unittest.TestCase):
    """Tests for ``codecs.escape_decode`` (backslash-escape decoding of
    byte strings)."""

    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Any byte other than a backslash, followed by '0', must come back
        # untouched with both bytes reported as consumed.
        decode = codecs.escape_decode
        for code in range(256):
            char = chr(code)
            if char == '\\':
                continue
            raw = char + '0'
            self.assertEqual(decode(raw), (raw, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded output)
        cases = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, expected in cases:
            check(escaped, expected)
        # Every byte that does not start a recognised escape must be left
        # alone together with its backslash.
        recognised = '\n"\'\\abtnvfr01234567x'
        for code in range(256):
            char = chr(code)
            if char in recognised:
                continue
            literal = '\\' + char
            check(literal, literal)

    def test_errors(self):
        decode = codecs.escape_decode
        # Truncated \x escapes: with no hex digit, then with only one.
        # Each row is (bare, bracketed, doubled, bytes consumed for doubled).
        families = (
            (br"\x", br"[\x]", br"[\x]\x", 6),
            (br"\x0", br"[\x0]", br"[\x0]\x0", 8),
        )
        for bare, bracketed, doubled, consumed in families:
            self.assertRaises(ValueError, decode, bare)
            self.assertRaises(ValueError, decode, bracketed)
            self.assertEqual(decode(doubled, "ignore"), (b"[]", consumed))
            self.assertEqual(decode(doubled, "replace"), (b"[?]?", consumed))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200916
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000917class RecodingTest(unittest.TestCase):
918 def test_recoding(self):
919 f = StringIO.StringIO()
920 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
921 f2.write(u"a")
922 f2.close()
923 # Python used to crash on this at exit because of a refcount
924 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000925
Martin v. Löwis2548c732003-04-18 10:39:54 +0000926# From RFC 3492
927punycode_testcases = [
928 # A Arabic (Egyptian):
929 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
930 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
931 "egbpdaj6bu4bxfgehfvwxn"),
932 # B Chinese (simplified):
933 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
934 "ihqwcrb4cv8a8dqg056pqjye"),
935 # C Chinese (traditional):
936 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
937 "ihqwctvzc91f659drss3x8bo0yb"),
938 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
939 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
940 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
941 u"\u0065\u0073\u006B\u0079",
942 "Proprostnemluvesky-uyb24dma41a"),
943 # E Hebrew:
944 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
945 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
946 u"\u05D1\u05E8\u05D9\u05EA",
947 "4dbcagdahymbxekheh6e0a7fei0b"),
948 # F Hindi (Devanagari):
949 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
950 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
951 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
952 u"\u0939\u0948\u0902",
953 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
954
955 #(G) Japanese (kanji and hiragana):
956 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
957 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
958 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
959
960 # (H) Korean (Hangul syllables):
961 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
962 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
963 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
964 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
965 "psd879ccm6fea98c"),
966
967 # (I) Russian (Cyrillic):
968 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
969 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
970 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
971 u"\u0438",
972 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
973
974 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
975 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
976 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
977 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
978 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
979 u"\u0061\u00F1\u006F\u006C",
980 "PorqunopuedensimplementehablarenEspaol-fmd56a"),
981
982 # (K) Vietnamese:
983 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
984 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
985 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
986 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
987 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
988 u"\u0056\u0069\u1EC7\u0074",
989 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
990
Martin v. Löwis2548c732003-04-18 10:39:54 +0000991 #(L) 3<nen>B<gumi><kinpachi><sensei>
992 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
993 "3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +0000994
Martin v. Löwis2548c732003-04-18 10:39:54 +0000995 # (M) <amuro><namie>-with-SUPER-MONKEYS
996 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
997 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
998 u"\u004F\u004E\u004B\u0045\u0059\u0053",
999 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
1000
1001 # (N) Hello-Another-Way-<sorezore><no><basho>
1002 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
1003 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
1004 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
1005 "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
1006
1007 # (O) <hitotsu><yane><no><shita>2
1008 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
1009 "2-u9tlzr9756bt3uc0v"),
1010
1011 # (P) Maji<de>Koi<suru>5<byou><mae>
1012 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
1013 u"\u308B\u0035\u79D2\u524D",
1014 "MajiKoi5-783gue6qz075azm5e"),
1015
1016 # (Q) <pafii>de<runba>
1017 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
1018 "de-jg4avhby1noc0d"),
1019
1020 # (R) <sono><supiido><de>
1021 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
1022 "d9juau41awczczp"),
1023
1024 # (S) -> $1.00 <-
1025 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
1026 u"\u003C\u002D",
1027 "-> $1.00 <--")
1028 ]
1029
# Sanity check on the table above: each entry must be a 2-tuple
# (unicode text, punycode text).  Anything else is printed for inspection.
for i in punycode_testcases:
    if len(i) == 2:
        continue
    print(repr(i))
1033
class PunycodeTest(unittest.TestCase):
    """Round-trips the ``punycode_testcases`` table through the "punycode"
    codec."""

    def test_encode(self):
        # Need to convert both strings to lower case, since
        # some of the extended encodings use upper case, but our
        # code produces only lower case. Converting just puny to
        # lower is also insufficient, since some of the input characters
        # are upper case.
        for text, expected in punycode_testcases:
            produced = text.encode("punycode")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        for expected, encoded in punycode_testcases:
            self.assertEqual(expected, encoded.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +00001047
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001048class UnicodeInternalTest(unittest.TestCase):
1049 def test_bug1251300(self):
1050 # Decoding with unicode_internal used to not correctly handle "code
1051 # points" above 0x10ffff on UCS-4 builds.
1052 if sys.maxunicode > 0xffff:
1053 ok = [
1054 ("\x00\x10\xff\xff", u"\U0010ffff"),
1055 ("\x00\x00\x01\x01", u"\U00000101"),
1056 ("", u""),
1057 ]
1058 not_ok = [
1059 "\x7f\xff\xff\xff",
1060 "\x80\x00\x00\x00",
1061 "\x81\x00\x00\x00",
1062 "\x00",
1063 "\x00\x00\x00\x00\x00",
1064 ]
1065 for internal, uni in ok:
1066 if sys.byteorder == "little":
1067 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +00001068 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001069 for internal in not_ok:
1070 if sys.byteorder == "little":
1071 internal = "".join(reversed(internal))
1072 self.assertRaises(UnicodeDecodeError, internal.decode,
1073 "unicode_internal")
1074
1075 def test_decode_error_attributes(self):
1076 if sys.maxunicode > 0xffff:
1077 try:
1078 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1079 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +00001080 self.assertEqual("unicode_internal", ex.encoding)
1081 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1082 self.assertEqual(4, ex.start)
1083 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001084 else:
1085 self.fail()
1086
1087 def test_decode_callback(self):
1088 if sys.maxunicode > 0xffff:
1089 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1090 decoder = codecs.getdecoder("unicode_internal")
1091 ab = u"ab".encode("unicode_internal")
1092 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1093 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +00001094 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001095
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001096 def test_encode_length(self):
1097 # Issue 3739
1098 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +00001099 self.assertEqual(encoder(u"a")[1], 1)
1100 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001101
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001102 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +00001103 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001104
Martin v. Löwis2548c732003-04-18 10:39:54 +00001105# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1106nameprep_tests = [
1107 # 3.1 Map to nothing.
1108 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1109 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1110 '\xb8\x8f\xef\xbb\xbf',
1111 'foobarbaz'),
1112 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
1113 ('CAFE',
1114 'cafe'),
1115 # 3.3 Case folding 8bit U+00DF (german sharp s).
1116 # The original test case is bogus; it says \xc3\xdf
1117 ('\xc3\x9f',
1118 'ss'),
1119 # 3.4 Case folding U+0130 (turkish capital I with dot).
1120 ('\xc4\xb0',
1121 'i\xcc\x87'),
1122 # 3.5 Case folding multibyte U+0143 U+037A.
1123 ('\xc5\x83\xcd\xba',
1124 '\xc5\x84 \xce\xb9'),
1125 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1126 # XXX: skip this as it fails in UCS-2 mode
1127 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1128 # 'telc\xe2\x88\x95kg\xcf\x83'),
1129 (None, None),
1130 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
1131 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
1132 '\xc7\xb0 a'),
1133 # 3.8 Case folding U+1FB7 and normalization.
1134 ('\xe1\xbe\xb7',
1135 '\xe1\xbe\xb6\xce\xb9'),
1136 # 3.9 Self-reverting case folding U+01F0 and normalization.
1137 # The original test case is bogus, it says `\xc7\xf0'
1138 ('\xc7\xb0',
1139 '\xc7\xb0'),
1140 # 3.10 Self-reverting case folding U+0390 and normalization.
1141 ('\xce\x90',
1142 '\xce\x90'),
1143 # 3.11 Self-reverting case folding U+03B0 and normalization.
1144 ('\xce\xb0',
1145 '\xce\xb0'),
1146 # 3.12 Self-reverting case folding U+1E96 and normalization.
1147 ('\xe1\xba\x96',
1148 '\xe1\xba\x96'),
1149 # 3.13 Self-reverting case folding U+1F56 and normalization.
1150 ('\xe1\xbd\x96',
1151 '\xe1\xbd\x96'),
1152 # 3.14 ASCII space character U+0020.
1153 (' ',
1154 ' '),
1155 # 3.15 Non-ASCII 8bit space character U+00A0.
1156 ('\xc2\xa0',
1157 ' '),
1158 # 3.16 Non-ASCII multibyte space character U+1680.
1159 ('\xe1\x9a\x80',
1160 None),
1161 # 3.17 Non-ASCII multibyte space character U+2000.
1162 ('\xe2\x80\x80',
1163 ' '),
1164 # 3.18 Zero Width Space U+200b.
1165 ('\xe2\x80\x8b',
1166 ''),
1167 # 3.19 Non-ASCII multibyte space character U+3000.
1168 ('\xe3\x80\x80',
1169 ' '),
1170 # 3.20 ASCII control characters U+0010 U+007F.
1171 ('\x10\x7f',
1172 '\x10\x7f'),
1173 # 3.21 Non-ASCII 8bit control character U+0085.
1174 ('\xc2\x85',
1175 None),
1176 # 3.22 Non-ASCII multibyte control character U+180E.
1177 ('\xe1\xa0\x8e',
1178 None),
1179 # 3.23 Zero Width No-Break Space U+FEFF.
1180 ('\xef\xbb\xbf',
1181 ''),
1182 # 3.24 Non-ASCII control character U+1D175.
1183 ('\xf0\x9d\x85\xb5',
1184 None),
1185 # 3.25 Plane 0 private use character U+F123.
1186 ('\xef\x84\xa3',
1187 None),
1188 # 3.26 Plane 15 private use character U+F1234.
1189 ('\xf3\xb1\x88\xb4',
1190 None),
1191 # 3.27 Plane 16 private use character U+10F234.
1192 ('\xf4\x8f\x88\xb4',
1193 None),
1194 # 3.28 Non-character code point U+8FFFE.
1195 ('\xf2\x8f\xbf\xbe',
1196 None),
1197 # 3.29 Non-character code point U+10FFFF.
1198 ('\xf4\x8f\xbf\xbf',
1199 None),
1200 # 3.30 Surrogate code U+DF42.
1201 ('\xed\xbd\x82',
1202 None),
1203 # 3.31 Non-plain text character U+FFFD.
1204 ('\xef\xbf\xbd',
1205 None),
1206 # 3.32 Ideographic description character U+2FF5.
1207 ('\xe2\xbf\xb5',
1208 None),
1209 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +00001210 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001211 '\xcc\x81'),
1212 # 3.34 Left-to-right mark U+200E.
1213 ('\xe2\x80\x8e',
1214 None),
1215 # 3.35 Deprecated U+202A.
1216 ('\xe2\x80\xaa',
1217 None),
1218 # 3.36 Language tagging character U+E0001.
1219 ('\xf3\xa0\x80\x81',
1220 None),
1221 # 3.37 Language tagging character U+E0042.
1222 ('\xf3\xa0\x81\x82',
1223 None),
1224 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
1225 ('foo\xd6\xbebar',
1226 None),
1227 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
1228 ('foo\xef\xb5\x90bar',
1229 None),
1230 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
1231 ('foo\xef\xb9\xb6bar',
1232 'foo \xd9\x8ebar'),
1233 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
1234 ('\xd8\xa71',
1235 None),
1236 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
1237 ('\xd8\xa71\xd8\xa8',
1238 '\xd8\xa71\xd8\xa8'),
1239 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001240 # Skip this test as we allow unassigned
1241 #('\xf3\xa0\x80\x82',
1242 # None),
1243 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001244 # 3.44 Larger test (shrinking).
1245 # Original test case reads \xc3\xdf
1246 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1247 '\xaa\xce\xb0\xe2\x80\x80',
1248 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
1249 # 3.45 Larger test (expanding).
1250 # Original test case reads \xc3\x9f
1251 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1252 '\x80',
1253 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1254 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1255 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
1256 ]
1257
1258
class NameprepTest(unittest.TestCase):
    """Runs ``encodings.idna.nameprep`` over the ``nameprep_tests`` table."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            raw = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, raw)
                continue
            expected = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(raw), expected)
            except Exception as err:
                # Re-raise with the 1-based position in the table so the
                # failing entry can be identified.
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(err)))
1277
Walter Dörwald78a0be62006-04-14 18:25:39 +00001278class IDNACodecTest(unittest.TestCase):
1279 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001280 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1281 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1282 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1283 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001284
1285 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001286 self.assertEqual(u"python.org".encode("idna"), "python.org")
1287 self.assertEqual("python.org.".encode("idna"), "python.org.")
1288 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1289 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001290
Martin v. Löwis8b595142005-08-25 11:03:38 +00001291 def test_stream(self):
1292 import StringIO
1293 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1294 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001295 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001296
Walter Dörwald78a0be62006-04-14 18:25:39 +00001297 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001298 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001299 "".join(codecs.iterdecode("python.org", "idna")),
1300 u"python.org"
1301 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001302 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001303 "".join(codecs.iterdecode("python.org.", "idna")),
1304 u"python.org."
1305 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001306 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001307 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1308 u"pyth\xf6n.org."
1309 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001310 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001311 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1312 u"pyth\xf6n.org."
1313 )
1314
1315 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001316 self.assertEqual(decoder.decode("xn--xam", ), u"")
1317 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1318 self.assertEqual(decoder.decode(u"rg"), u"")
1319 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001320
1321 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001322 self.assertEqual(decoder.decode("xn--xam", ), u"")
1323 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1324 self.assertEqual(decoder.decode("rg."), u"org.")
1325 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001326
1327 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001328 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001329 "".join(codecs.iterencode(u"python.org", "idna")),
1330 "python.org"
1331 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001332 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001333 "".join(codecs.iterencode(u"python.org.", "idna")),
1334 "python.org."
1335 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001336 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001337 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1338 "xn--pythn-mua.org."
1339 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001340 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001341 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1342 "xn--pythn-mua.org."
1343 )
1344
1345 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001346 self.assertEqual(encoder.encode(u"\xe4x"), "")
1347 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1348 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001349
1350 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001351 self.assertEqual(encoder.encode(u"\xe4x"), "")
1352 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1353 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001354
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001355class CodecsModuleTest(unittest.TestCase):
1356
1357 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001358 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001359 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001360 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001361 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001362 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1363
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001364 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001365 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001366 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001367 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001368 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001369 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001370 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1371
1372 def test_register(self):
1373 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001374 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001375
1376 def test_lookup(self):
1377 self.assertRaises(TypeError, codecs.lookup)
1378 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001379 self.assertRaises(LookupError, codecs.lookup, " ")
1380
1381 def test_getencoder(self):
1382 self.assertRaises(TypeError, codecs.getencoder)
1383 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1384
1385 def test_getdecoder(self):
1386 self.assertRaises(TypeError, codecs.getdecoder)
1387 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1388
1389 def test_getreader(self):
1390 self.assertRaises(TypeError, codecs.getreader)
1391 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1392
1393 def test_getwriter(self):
1394 self.assertRaises(TypeError, codecs.getwriter)
1395 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001396
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001397 def test_lookup_issue1813(self):
1398 # Issue #1813: under Turkish locales, lookup of some codecs failed
1399 # because 'I' is lowercased as a dotless "i"
1400 oldlocale = locale.getlocale(locale.LC_CTYPE)
1401 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1402 try:
1403 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1404 except locale.Error:
1405 # Unsupported locale on this system
1406 self.skipTest('test needs Turkish locale')
1407 c = codecs.lookup('ASCII')
1408 self.assertEqual(c.name, 'ascii')
1409
Serhiy Storchaka74a651b2014-12-20 17:42:24 +02001410 def test_all(self):
1411 api = (
1412 "encode", "decode",
1413 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1414 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1415 "getencoder", "getdecoder", "getincrementalencoder",
1416 "getincrementaldecoder", "getreader", "getwriter",
1417 "register_error", "lookup_error",
1418 "strict_errors", "replace_errors", "ignore_errors",
1419 "xmlcharrefreplace_errors", "backslashreplace_errors",
1420 "open", "EncodedFile",
1421 "iterencode", "iterdecode",
1422 "BOM", "BOM_BE", "BOM_LE",
1423 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1424 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1425 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1426 "StreamReaderWriter", "StreamRecoder",
1427 )
1428 self.assertEqual(sorted(api), sorted(codecs.__all__))
1429 for api in codecs.__all__:
1430 getattr(codecs, api)
1431
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001432class StreamReaderTest(unittest.TestCase):
1433
1434 def setUp(self):
1435 self.reader = codecs.getreader('utf-8')
1436 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1437
1438 def test_readlines(self):
1439 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001440 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001441
Georg Brandl8f99f812006-10-29 08:39:22 +00001442class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001443
Georg Brandl8f99f812006-10-29 08:39:22 +00001444 def test_basic(self):
1445 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001446 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001447 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001448
1449 f = StringIO.StringIO()
1450 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1451 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001452 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001453
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001454class Str2StrTest(unittest.TestCase):
1455
1456 def test_read(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001457 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001458 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1459 sout = reader.read()
1460 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001461 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001462
1463 def test_readline(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001464 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001465 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1466 sout = reader.readline()
1467 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001468 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001469
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001470all_unicode_encodings = [
1471 "ascii",
1472 "base64_codec",
1473 "big5",
1474 "big5hkscs",
1475 "charmap",
1476 "cp037",
1477 "cp1006",
1478 "cp1026",
1479 "cp1140",
1480 "cp1250",
1481 "cp1251",
1482 "cp1252",
1483 "cp1253",
1484 "cp1254",
1485 "cp1255",
1486 "cp1256",
1487 "cp1257",
1488 "cp1258",
1489 "cp424",
1490 "cp437",
1491 "cp500",
Georg Brandlf0757a22010-05-24 21:29:07 +00001492 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001493 "cp737",
1494 "cp775",
1495 "cp850",
1496 "cp852",
1497 "cp855",
1498 "cp856",
1499 "cp857",
Georg Brandlf0757a22010-05-24 21:29:07 +00001500 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001501 "cp860",
1502 "cp861",
1503 "cp862",
1504 "cp863",
1505 "cp864",
1506 "cp865",
1507 "cp866",
1508 "cp869",
1509 "cp874",
1510 "cp875",
1511 "cp932",
1512 "cp949",
1513 "cp950",
1514 "euc_jis_2004",
1515 "euc_jisx0213",
1516 "euc_jp",
1517 "euc_kr",
1518 "gb18030",
1519 "gb2312",
1520 "gbk",
1521 "hex_codec",
1522 "hp_roman8",
1523 "hz",
1524 "idna",
1525 "iso2022_jp",
1526 "iso2022_jp_1",
1527 "iso2022_jp_2",
1528 "iso2022_jp_2004",
1529 "iso2022_jp_3",
1530 "iso2022_jp_ext",
1531 "iso2022_kr",
1532 "iso8859_1",
1533 "iso8859_10",
1534 "iso8859_11",
1535 "iso8859_13",
1536 "iso8859_14",
1537 "iso8859_15",
1538 "iso8859_16",
1539 "iso8859_2",
1540 "iso8859_3",
1541 "iso8859_4",
1542 "iso8859_5",
1543 "iso8859_6",
1544 "iso8859_7",
1545 "iso8859_8",
1546 "iso8859_9",
1547 "johab",
1548 "koi8_r",
1549 "koi8_u",
1550 "latin_1",
1551 "mac_cyrillic",
1552 "mac_greek",
1553 "mac_iceland",
1554 "mac_latin2",
1555 "mac_roman",
1556 "mac_turkish",
1557 "palmos",
1558 "ptcp154",
1559 "punycode",
1560 "raw_unicode_escape",
1561 "rot_13",
1562 "shift_jis",
1563 "shift_jis_2004",
1564 "shift_jisx0213",
1565 "tis_620",
1566 "unicode_escape",
1567 "unicode_internal",
1568 "utf_16",
1569 "utf_16_be",
1570 "utf_16_le",
1571 "utf_7",
1572 "utf_8",
1573]
1574
# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken *before* the conditional additions below, so the
# incremental-coder list holds only the four names above.
broken_incremental_coders = list(broken_unicode_with_streams)

if sys.flags.py3k_warning:
    broken_unicode_with_streams.append("rot_13")

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are only registered for testing when their
# backing module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1623
class BasicUnicodeTest(unittest.TestCase):
    """Generic round-trip and argument checks run over all_unicode_encodings.

    Every assertion made inside a per-codec loop carries an "encoding=%r"
    message so that a failure names the codec that triggered it.
    """

    def test_basics(self):
        """Round-trip a short ASCII sample through every codec interface.

        For each codec this checks the registered name, the stateless
        encoder/decoder, the stream reader/writer (unless the codec is in
        broken_unicode_with_streams) and the incremental coders plus
        iterencode()/iterdecode() (unless it is in broken_incremental_coders).
        """
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             name.replace("_", "-"),
                             "encoding=%r" % encoding)
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(
                            codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"", "encoding=%r" % encoding)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @test_support.cpython_only
    def test_basics_capi(self):
        """Repeat the incremental round-trip using the _testcapi wrappers."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c)
                                                 for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading the first line."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna":  # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line,
                                 "encoding=%r" % encoding)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) a non-string one."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempted from the integer-argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Comparing the type of a charmap encoding table must not crash."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1770
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs listed in all_string_encodings."""

    def test_basics(self):
        """Encode and decode a short sample; the result must equal the input."""
        sample = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(sample)
            self.assertEqual(consumed, len(sample))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            message = "%r != %r (encoding=%r)" % (decoded, sample, encoding)
            self.assertEqual(decoded, sample, message)
1779
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001780class CharmapTest(unittest.TestCase):
1781 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001782 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001783 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1784 (u"abc", 3)
1785 )
1786
Serhiy Storchaka95997452013-01-15 14:42:59 +02001787 self.assertRaises(UnicodeDecodeError,
1788 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1789 )
1790
1791 self.assertRaises(UnicodeDecodeError,
1792 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1793 )
1794
Ezio Melotti2623a372010-11-21 13:34:58 +00001795 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001796 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1797 (u"ab\ufffd", 3)
1798 )
1799
Ezio Melotti2623a372010-11-21 13:34:58 +00001800 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001801 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1802 (u"ab\ufffd", 3)
1803 )
1804
Ezio Melotti2623a372010-11-21 13:34:58 +00001805 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001806 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1807 (u"ab", 3)
1808 )
1809
Ezio Melotti2623a372010-11-21 13:34:58 +00001810 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001811 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1812 (u"ab", 3)
1813 )
1814
1815 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001816 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001817 codecs.charmap_decode(allbytes, "ignore", u""),
1818 (u"", len(allbytes))
1819 )
1820
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001821 def test_decode_with_int2str_map(self):
1822 self.assertEqual(
1823 codecs.charmap_decode("\x00\x01\x02", "strict",
1824 {0: u'a', 1: u'b', 2: u'c'}),
1825 (u"abc", 3)
1826 )
1827
1828 self.assertEqual(
1829 codecs.charmap_decode("\x00\x01\x02", "strict",
1830 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1831 (u"AaBbCc", 3)
1832 )
1833
1834 self.assertEqual(
1835 codecs.charmap_decode("\x00\x01\x02", "strict",
1836 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1837 (u"\U0010FFFFbc", 3)
1838 )
1839
1840 self.assertEqual(
1841 codecs.charmap_decode("\x00\x01\x02", "strict",
1842 {0: u'a', 1: u'b', 2: u''}),
1843 (u"ab", 3)
1844 )
1845
1846 self.assertRaises(UnicodeDecodeError,
1847 codecs.charmap_decode, "\x00\x01\x02", "strict",
1848 {0: u'a', 1: u'b'}
1849 )
1850
Serhiy Storchaka95997452013-01-15 14:42:59 +02001851 self.assertRaises(UnicodeDecodeError,
1852 codecs.charmap_decode, "\x00\x01\x02", "strict",
1853 {0: u'a', 1: u'b', 2: None}
1854 )
1855
1856 # Issue #14850
1857 self.assertRaises(UnicodeDecodeError,
1858 codecs.charmap_decode, "\x00\x01\x02", "strict",
1859 {0: u'a', 1: u'b', 2: u'\ufffe'}
1860 )
1861
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001862 self.assertEqual(
1863 codecs.charmap_decode("\x00\x01\x02", "replace",
1864 {0: u'a', 1: u'b'}),
1865 (u"ab\ufffd", 3)
1866 )
1867
1868 self.assertEqual(
1869 codecs.charmap_decode("\x00\x01\x02", "replace",
1870 {0: u'a', 1: u'b', 2: None}),
1871 (u"ab\ufffd", 3)
1872 )
1873
Serhiy Storchaka95997452013-01-15 14:42:59 +02001874 # Issue #14850
1875 self.assertEqual(
1876 codecs.charmap_decode("\x00\x01\x02", "replace",
1877 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1878 (u"ab\ufffd", 3)
1879 )
1880
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001881 self.assertEqual(
1882 codecs.charmap_decode("\x00\x01\x02", "ignore",
1883 {0: u'a', 1: u'b'}),
1884 (u"ab", 3)
1885 )
1886
1887 self.assertEqual(
1888 codecs.charmap_decode("\x00\x01\x02", "ignore",
1889 {0: u'a', 1: u'b', 2: None}),
1890 (u"ab", 3)
1891 )
1892
Serhiy Storchaka95997452013-01-15 14:42:59 +02001893 # Issue #14850
1894 self.assertEqual(
1895 codecs.charmap_decode("\x00\x01\x02", "ignore",
1896 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1897 (u"ab", 3)
1898 )
1899
1900 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001901 self.assertEqual(
1902 codecs.charmap_decode(allbytes, "ignore", {}),
1903 (u"", len(allbytes))
1904 )
1905
1906 def test_decode_with_int2int_map(self):
1907 a = ord(u'a')
1908 b = ord(u'b')
1909 c = ord(u'c')
1910
1911 self.assertEqual(
1912 codecs.charmap_decode("\x00\x01\x02", "strict",
1913 {0: a, 1: b, 2: c}),
1914 (u"abc", 3)
1915 )
1916
1917 # Issue #15379
1918 self.assertEqual(
1919 codecs.charmap_decode("\x00\x01\x02", "strict",
1920 {0: 0x10FFFF, 1: b, 2: c}),
1921 (u"\U0010FFFFbc", 3)
1922 )
1923
1924 self.assertRaises(TypeError,
1925 codecs.charmap_decode, "\x00\x01\x02", "strict",
1926 {0: 0x110000, 1: b, 2: c}
1927 )
1928
1929 self.assertRaises(UnicodeDecodeError,
1930 codecs.charmap_decode, "\x00\x01\x02", "strict",
1931 {0: a, 1: b},
1932 )
1933
Serhiy Storchaka95997452013-01-15 14:42:59 +02001934 self.assertRaises(UnicodeDecodeError,
1935 codecs.charmap_decode, "\x00\x01\x02", "strict",
1936 {0: a, 1: b, 2: 0xFFFE},
1937 )
1938
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001939 self.assertEqual(
1940 codecs.charmap_decode("\x00\x01\x02", "replace",
1941 {0: a, 1: b}),
1942 (u"ab\ufffd", 3)
1943 )
1944
1945 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001946 codecs.charmap_decode("\x00\x01\x02", "replace",
1947 {0: a, 1: b, 2: 0xFFFE}),
1948 (u"ab\ufffd", 3)
1949 )
1950
1951 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001952 codecs.charmap_decode("\x00\x01\x02", "ignore",
1953 {0: a, 1: b}),
1954 (u"ab", 3)
1955 )
1956
Serhiy Storchaka95997452013-01-15 14:42:59 +02001957 self.assertEqual(
1958 codecs.charmap_decode("\x00\x01\x02", "ignore",
1959 {0: a, 1: b, 2: 0xFFFE}),
1960 (u"ab", 3)
1961 )
1962
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001963
Georg Brandl8f99f812006-10-29 08:39:22 +00001964class WithStmtTest(unittest.TestCase):
1965 def test_encodedfile(self):
1966 f = StringIO.StringIO("\xc3\xbc")
1967 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001968 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001969
1970 def test_streamreaderwriter(self):
1971 f = StringIO.StringIO("\xc3\xbc")
1972 info = codecs.lookup("utf-8")
1973 with codecs.StreamReaderWriter(f, info.streamreader,
1974 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001975 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001976
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001977
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001978class UnicodeEscapeTest(unittest.TestCase):
1979 def test_empty(self):
1980 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1981 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1982
1983 def test_raw_encode(self):
1984 encode = codecs.unicode_escape_encode
1985 for b in range(32, 127):
1986 if b != ord('\\'):
1987 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1988
1989 def test_raw_decode(self):
1990 decode = codecs.unicode_escape_decode
1991 for b in range(256):
1992 if b != ord('\\'):
1993 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1994
1995 def test_escape_encode(self):
1996 encode = codecs.unicode_escape_encode
1997 check = coding_checker(self, encode)
1998 check(u'\t', r'\t')
1999 check(u'\n', r'\n')
2000 check(u'\r', r'\r')
2001 check(u'\\', r'\\')
2002 for b in range(32):
2003 if chr(b) not in '\t\n\r':
2004 check(unichr(b), '\\x%02x' % b)
2005 for b in range(127, 256):
2006 check(unichr(b), '\\x%02x' % b)
2007 check(u'\u20ac', r'\u20ac')
2008 check(u'\U0001d120', r'\U0001d120')
2009
2010 def test_escape_decode(self):
2011 decode = codecs.unicode_escape_decode
2012 check = coding_checker(self, decode)
2013 check("[\\\n]", u"[]")
2014 check(r'[\"]', u'["]')
2015 check(r"[\']", u"[']")
2016 check(r"[\\]", ur"[\]")
2017 check(r"[\a]", u"[\x07]")
2018 check(r"[\b]", u"[\x08]")
2019 check(r"[\t]", u"[\x09]")
2020 check(r"[\n]", u"[\x0a]")
2021 check(r"[\v]", u"[\x0b]")
2022 check(r"[\f]", u"[\x0c]")
2023 check(r"[\r]", u"[\x0d]")
2024 check(r"[\7]", u"[\x07]")
2025 check(r"[\8]", ur"[\8]")
2026 check(r"[\78]", u"[\x078]")
2027 check(r"[\41]", u"[!]")
2028 check(r"[\418]", u"[!8]")
2029 check(r"[\101]", u"[A]")
2030 check(r"[\1010]", u"[A0]")
2031 check(r"[\x41]", u"[A]")
2032 check(r"[\x410]", u"[A0]")
2033 check(r"\u20ac", u"\u20ac")
2034 check(r"\U0001d120", u"\U0001d120")
2035 for b in range(256):
2036 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
2037 check('\\' + chr(b), u'\\' + unichr(b))
2038
2039 def test_decode_errors(self):
2040 decode = codecs.unicode_escape_decode
2041 for c, d in ('x', 2), ('u', 4), ('U', 4):
2042 for i in range(d):
2043 self.assertRaises(UnicodeDecodeError, decode,
2044 "\\" + c + "0"*i)
2045 self.assertRaises(UnicodeDecodeError, decode,
2046 "[\\" + c + "0"*i + "]")
2047 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2048 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2049 self.assertEqual(decode(data, "replace"),
2050 (u"[\ufffd]\ufffd", len(data)))
2051 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2052 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2053 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2054
2055
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode / raw_unicode_escape_decode."""

    def test_empty(self):
        """Empty input yields empty output and a consumed length of 0."""
        encoded = codecs.raw_unicode_escape_encode(u"")
        self.assertEqual(encoded, ("", 0))
        decoded = codecs.raw_unicode_escape_decode("")
        self.assertEqual(decoded, (u"", 0))

    def test_raw_encode(self):
        """Every code point below 256 is encoded as the byte of equal value."""
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            expected = (chr(code), 1)
            self.assertEqual(encode(unichr(code)), expected)

    def test_raw_decode(self):
        """Every byte decodes to the code point of equal value."""
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            expected = (unichr(code) + u'0', 2)
            self.assertEqual(decode(chr(code) + '0'), expected)

    def test_escape_encode(self):
        """Backslash pairs other than \\u and \\U are left alone on encoding."""
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check(u'\\' + unichr(code), '\\' + chr(code))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """Only \\u and \\U escapes are interpreted on decoding."""
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        """Truncated or out-of-range escapes honour the error handler."""
        decode = codecs.raw_unicode_escape_decode
        for letter, digits in (('u', 4), ('U', 4)):
            for count in range(digits):
                truncated = "\\" + letter + "0"*count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[" + truncated + "]")
                data = "[" + truncated + "]" + truncated
                self.assertEqual(decode(data, "ignore"),
                                 (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        too_large = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_large)
        self.assertEqual(decode(too_large, "ignore"), (u"", 10))
        self.assertEqual(decode(too_large, "replace"), (u"\ufffd", 10))
2104
2105
class BomTest(unittest.TestCase):
    """Checks on how the byte order mark interacts with seek()."""

    def test_seek0(self):
        """Write, seek and read back a scratch file in each UTF-16/32 variant."""
        payload = u"1234567890"
        encodings = (
            "utf-16",
            "utf-16-le",
            "utf-16-be",
            "utf-32",
            "utf-32-le",
            "utf-32-be",
        )
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in encodings:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+',
                             encoding=encoding) as stream:
                stream.write(payload)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+',
                             encoding=encoding) as stream:
                stream.write(payload[0])
                self.assertNotEqual(stream.tell(), 0)
                stream.seek(0)
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+',
                             encoding=encoding) as stream:
                stream.writer.write(payload[0])
                self.assertNotEqual(stream.writer.tell(), 0)
                stream.writer.seek(0)
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+',
                             encoding=encoding) as stream:
                stream.write(payload)
                stream.seek(stream.tell())
                stream.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+',
                             encoding=encoding) as stream:
                stream.writer.write(payload)
                stream.writer.seek(stream.writer.tell())
                stream.writer.write(payload)
                stream.seek(0)
                self.assertEqual(stream.read(), payload * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002161
Victor Stinner262be5e2010-05-22 02:11:07 +00002162
Martin Panter90bc71f2015-09-12 02:20:06 +00002163class TransformCodecTest(unittest.TestCase):
2164
Martin Panterb2528c92015-09-12 00:34:28 +00002165 def test_quopri_stateless(self):
2166 # Should encode with quotetabs=True
2167 encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
2168 self.assertEqual(encoded, b"space=20tab=09eol=20\n")
2169 # But should still support unescaped tabs and spaces
2170 unescaped = b"space tab eol\n"
2171 self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)
2172
Martin Panter90bc71f2015-09-12 02:20:06 +00002173 def test_uu_invalid(self):
2174 # Missing "begin" line
2175 self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
2176
2177
def test_main():
    """Hand the listed test case classes to test_support.run_unittest()."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    test_support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()