from test import test_support
import unittest
import codecs
import locale
import sys, StringIO

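# Helper used by the escape-decoding tests below: returns a check(input, expect)
# closure that asserts coder(input) produces the expected output and reports
# having consumed the whole input.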
def coding_checker(self, coder):
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                self.assertEqual(
                    reader.readline(keepends=True),
                    "xxx\n",
                )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                self.assertEqual(
                    reader.readline(keepends=False),
                    "xxx",
                )

    def test_mixed_readline_and_read(self):
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '    \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF32Test(ReadTest):
    encoding = "utf-32"

    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        tests = [
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

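# Sanity check: every entry above must be a (unicode, punycode) pair.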
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

    def test_all(self):
        api = (
            "encode", "decode",
            "register", "CodecInfo", "Codec", "IncrementalEncoder",
            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
            "getencoder", "getdecoder", "getincrementalencoder",
            "getincrementaldecoder", "getreader", "getwriter",
            "register_error", "lookup_error",
            "strict_errors", "replace_errors", "ignore_errors",
            "xmlcharrefreplace_errors", "backslashreplace_errors",
            "open", "EncodedFile",
            "iterencode", "iterdecode",
            "BOM", "BOM_BE", "BOM_LE",
            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
            "StreamReaderWriter", "StreamRecoder",
        )
        self.assertEqual(sorted(api), sorted(codecs.__all__))
        for api in codecs.__all__:
            getattr(codecs, api)

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEqual(f.getvalue(), '\xfc')

Walter Dörwaldc9878e12005-07-20 22:15:39 +00001395class Str2StrTest(unittest.TestCase):
1396
1397 def test_read(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001398 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001399 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1400 sout = reader.read()
1401 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001402 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001403
1404 def test_readline(self):
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001405 sin = codecs.encode("\x80", "base64_codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001406 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1407 sout = reader.readline()
1408 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001409 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001410
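# Unicode-capable codecs exercised by BasicUnicodeTest below; every entry is
# expected to round-trip the sample unicode strings used in that test case.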
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001411all_unicode_encodings = [
1412 "ascii",
1413 "base64_codec",
1414 "big5",
1415 "big5hkscs",
1416 "charmap",
1417 "cp037",
1418 "cp1006",
1419 "cp1026",
1420 "cp1140",
1421 "cp1250",
1422 "cp1251",
1423 "cp1252",
1424 "cp1253",
1425 "cp1254",
1426 "cp1255",
1427 "cp1256",
1428 "cp1257",
1429 "cp1258",
1430 "cp424",
1431 "cp437",
1432 "cp500",
Georg Brandlf0757a22010-05-24 21:29:07 +00001433 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001434 "cp737",
1435 "cp775",
1436 "cp850",
1437 "cp852",
1438 "cp855",
1439 "cp856",
1440 "cp857",
Georg Brandlf0757a22010-05-24 21:29:07 +00001441 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001442 "cp860",
1443 "cp861",
1444 "cp862",
1445 "cp863",
1446 "cp864",
1447 "cp865",
1448 "cp866",
1449 "cp869",
1450 "cp874",
1451 "cp875",
1452 "cp932",
1453 "cp949",
1454 "cp950",
1455 "euc_jis_2004",
1456 "euc_jisx0213",
1457 "euc_jp",
1458 "euc_kr",
1459 "gb18030",
1460 "gb2312",
1461 "gbk",
1462 "hex_codec",
1463 "hp_roman8",
1464 "hz",
1465 "idna",
1466 "iso2022_jp",
1467 "iso2022_jp_1",
1468 "iso2022_jp_2",
1469 "iso2022_jp_2004",
1470 "iso2022_jp_3",
1471 "iso2022_jp_ext",
1472 "iso2022_kr",
1473 "iso8859_1",
1474 "iso8859_10",
1475 "iso8859_11",
1476 "iso8859_13",
1477 "iso8859_14",
1478 "iso8859_15",
1479 "iso8859_16",
1480 "iso8859_2",
1481 "iso8859_3",
1482 "iso8859_4",
1483 "iso8859_5",
1484 "iso8859_6",
1485 "iso8859_7",
1486 "iso8859_8",
1487 "iso8859_9",
1488 "johab",
1489 "koi8_r",
1490 "koi8_u",
1491 "latin_1",
1492 "mac_cyrillic",
1493 "mac_greek",
1494 "mac_iceland",
1495 "mac_latin2",
1496 "mac_roman",
1497 "mac_turkish",
1498 "palmos",
1499 "ptcp154",
1500 "punycode",
1501 "raw_unicode_escape",
1502 "rot_13",
1503 "shift_jis",
1504 "shift_jis_2004",
1505 "shift_jisx0213",
1506 "tis_620",
1507 "unicode_escape",
1508 "unicode_internal",
1509 "utf_16",
1510 "utf_16_be",
1511 "utf_16_le",
1512 "utf_7",
1513 "utf_8",
1514]
1515
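# mbcs is only built on Windows, so add it only when the codec is present.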
1516if hasattr(codecs, "mbcs_encode"):
1517 all_unicode_encodings.append("mbcs")
1518
1519# The following encodings work only with str, not unicode
1520all_string_encodings = [
1521 "quopri_codec",
1522 "string_escape",
1523 "uu_codec",
1524]
1525
1526# The following encoding is not tested, because it's not supposed
1527# to work:
1528# "undefined"
1529
1530# The following encodings don't work in stateful mode
1531broken_unicode_with_streams = [
1532 "base64_codec",
1533 "hex_codec",
1534 "punycode",
1535 "unicode_internal"
1536]
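# Snapshot taken before the extra appends below, so codecs that are only
# stream-broken (the bz2/zlib codecs added further down, and rot_13 under -3
# warnings) still have their incremental coders exercised.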
Georg Brandl2c9838e2006-10-29 14:39:09 +00001537broken_incremental_coders = broken_unicode_with_streams[:]
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001538
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03001539if sys.flags.py3k_warning:
1540 broken_unicode_with_streams.append("rot_13")
1541
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001542# The following encodings only support "strict" mode
1543only_strict_mode = [
1544 "idna",
1545 "zlib_codec",
Neal Norwitz1ead6982006-10-29 23:58:36 +00001546 "bz2_codec",
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001547]
1548
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001549try:
1550 import bz2
1551except ImportError:
1552 pass
1553else:
1554 all_unicode_encodings.append("bz2_codec")
1555 broken_unicode_with_streams.append("bz2_codec")
1556
1557try:
1558 import zlib
1559except ImportError:
1560 pass
1561else:
1562 all_unicode_encodings.append("zlib_codec")
1563 broken_unicode_with_streams.append("zlib_codec")
1564
1565class BasicUnicodeTest(unittest.TestCase):
1566 def test_basics(self):
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001567 s = u"abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001568 for encoding in all_unicode_encodings:
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001569 name = codecs.lookup(encoding).name
1570 if encoding.endswith("_codec"):
1571 name += "_codec"
1572 elif encoding == "latin_1":
1573 name = "latin_1"
1574 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001575 (bytes, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001576 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001577 (chars, size) = codecs.getdecoder(encoding)(bytes)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001578 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001579
1580 if encoding not in broken_unicode_with_streams:
1581 # check stream reader/writer
1582 q = Queue()
1583 writer = codecs.getwriter(encoding)(q)
1584 encodedresult = ""
1585 for c in s:
1586 writer.write(c)
1587 encodedresult += q.read()
1588 q = Queue()
1589 reader = codecs.getreader(encoding)(q)
1590 decodedresult = u""
1591 for c in encodedresult:
1592 q.write(c)
1593 decodedresult += reader.read()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001594 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001595
Georg Brandl2c9838e2006-10-29 14:39:09 +00001596 if encoding not in broken_incremental_coders:
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001597 # check incremental decoder/encoder and iterencode()/iterdecode()
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001598 try:
1599 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001600 except LookupError: # no IncrementalEncoder
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001601 pass
1602 else:
1603 # check incremental decoder/encoder
1604 encodedresult = ""
1605 for c in s:
1606 encodedresult += encoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001607 encodedresult += encoder.encode(u"", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001608 decoder = codecs.getincrementaldecoder(encoding)()
1609 decodedresult = u""
1610 for c in encodedresult:
1611 decodedresult += decoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001612 decodedresult += decoder.decode("", True)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001613 self.assertEqual(decodedresult, s,
1614 "encoding=%r" % encoding)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001615
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001616 # check iterencode()/iterdecode()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001617 result = u"".join(codecs.iterdecode(
1618 codecs.iterencode(s, encoding), encoding))
1619 self.assertEqual(result, s, "encoding=%r" % encoding)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001620
1621 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001622 result = u"".join(codecs.iterdecode(
1623 codecs.iterencode(u"", encoding), encoding))
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001624 self.assertEqual(result, u"")
1625
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001626 if encoding not in only_strict_mode:
1627 # check incremental decoder/encoder with errors argument
1628 try:
1629 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001630 except LookupError: # no IncrementalEncoder
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001631 pass
1632 else:
1633 encodedresult = "".join(encoder.encode(c) for c in s)
1634 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001635 decodedresult = u"".join(decoder.decode(c)
1636 for c in encodedresult)
1637 self.assertEqual(decodedresult, s,
1638 "encoding=%r" % encoding)
Tim Petersabd8a332006-11-03 02:32:46 +00001639
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001640 @test_support.cpython_only
1641 def test_basics_capi(self):
1642 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
1643 s = u"abc123" # all codecs should be able to encode these
1644 for encoding in all_unicode_encodings:
1645 if encoding not in broken_incremental_coders:
1646 # check incremental decoder/encoder and iterencode()/iterdecode()
1647 try:
1648 cencoder = codec_incrementalencoder(encoding)
1649 except LookupError: # no IncrementalEncoder
1650 pass
1651 else:
1652 # check C API
1653 encodedresult = ""
1654 for c in s:
1655 encodedresult += cencoder.encode(c)
1656 encodedresult += cencoder.encode(u"", True)
1657 cdecoder = codec_incrementaldecoder(encoding)
1658 decodedresult = u""
1659 for c in encodedresult:
1660 decodedresult += cdecoder.decode(c)
1661 decodedresult += cdecoder.decode("", True)
1662 self.assertEqual(decodedresult, s,
1663 "encoding=%r" % encoding)
1664
1665 if encoding not in only_strict_mode:
1666 # check incremental decoder/encoder with errors argument
1667 try:
1668 cencoder = codec_incrementalencoder(encoding, "ignore")
1669 except LookupError: # no IncrementalEncoder
1670 pass
1671 else:
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001672 encodedresult = "".join(cencoder.encode(c) for c in s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001673 cdecoder = codec_incrementaldecoder(encoding, "ignore")
1674 decodedresult = u"".join(cdecoder.decode(c)
1675 for c in encodedresult)
1676 self.assertEqual(decodedresult, s,
1677 "encoding=%r" % encoding)
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001678
Walter Dörwald729c31f2005-03-14 19:06:30 +00001679 def test_seek(self):
1680 # all codecs should be able to encode these
1681 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1682 for encoding in all_unicode_encodings:
1683 if encoding == "idna": # FIXME: See SF bug #1163178
1684 continue
1685 if encoding in broken_unicode_with_streams:
1686 continue
1687 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1688 for t in xrange(5):
1689 # Test that calling seek resets the internal codec state and buffers
1690 reader.seek(0, 0)
1691 line = reader.readline()
1692 self.assertEqual(s[:len(line)], line)
1693
Walter Dörwalde22d3392005-11-17 08:52:34 +00001694 def test_bad_decode_args(self):
1695 for encoding in all_unicode_encodings:
1696 decoder = codecs.getdecoder(encoding)
1697 self.assertRaises(TypeError, decoder)
1698 if encoding not in ("idna", "punycode"):
1699 self.assertRaises(TypeError, decoder, 42)
1700
1701 def test_bad_encode_args(self):
1702 for encoding in all_unicode_encodings:
1703 encoder = codecs.getencoder(encoding)
1704 self.assertRaises(TypeError, encoder)
1705
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001706 def test_encoding_map_type_initialized(self):
1707 from encodings import cp1140
1708 # This used to crash, we are only verifying there's no crash.
1709        # This used to crash; we are only verifying that there's no crash.
1710 self.assertEqual(table_type, table_type)
1711
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001712class BasicStrTest(unittest.TestCase):
1713 def test_basics(self):
1714 s = "abc123"
1715 for encoding in all_string_encodings:
1716 (bytes, size) = codecs.getencoder(encoding)(s)
1717 self.assertEqual(size, len(s))
1718 (chars, size) = codecs.getdecoder(encoding)(bytes)
1719 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1720
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001721class CharmapTest(unittest.TestCase):
1722 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001723 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001724 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1725 (u"abc", 3)
1726 )
1727
Serhiy Storchaka95997452013-01-15 14:42:59 +02001728 self.assertRaises(UnicodeDecodeError,
1729 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1730 )
1731
1732 self.assertRaises(UnicodeDecodeError,
1733 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1734 )
1735
Ezio Melotti2623a372010-11-21 13:34:58 +00001736 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001737 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1738 (u"ab\ufffd", 3)
1739 )
1740
Ezio Melotti2623a372010-11-21 13:34:58 +00001741 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001742 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1743 (u"ab\ufffd", 3)
1744 )
1745
Ezio Melotti2623a372010-11-21 13:34:58 +00001746 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001747 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1748 (u"ab", 3)
1749 )
1750
Ezio Melotti2623a372010-11-21 13:34:58 +00001751 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001752 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1753 (u"ab", 3)
1754 )
1755
1756 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001757 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001758 codecs.charmap_decode(allbytes, "ignore", u""),
1759 (u"", len(allbytes))
1760 )
1761
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001762 def test_decode_with_int2str_map(self):
1763 self.assertEqual(
1764 codecs.charmap_decode("\x00\x01\x02", "strict",
1765 {0: u'a', 1: u'b', 2: u'c'}),
1766 (u"abc", 3)
1767 )
1768
1769 self.assertEqual(
1770 codecs.charmap_decode("\x00\x01\x02", "strict",
1771 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1772 (u"AaBbCc", 3)
1773 )
1774
1775 self.assertEqual(
1776 codecs.charmap_decode("\x00\x01\x02", "strict",
1777 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1778 (u"\U0010FFFFbc", 3)
1779 )
1780
1781 self.assertEqual(
1782 codecs.charmap_decode("\x00\x01\x02", "strict",
1783 {0: u'a', 1: u'b', 2: u''}),
1784 (u"ab", 3)
1785 )
1786
1787 self.assertRaises(UnicodeDecodeError,
1788 codecs.charmap_decode, "\x00\x01\x02", "strict",
1789 {0: u'a', 1: u'b'}
1790 )
1791
Serhiy Storchaka95997452013-01-15 14:42:59 +02001792 self.assertRaises(UnicodeDecodeError,
1793 codecs.charmap_decode, "\x00\x01\x02", "strict",
1794 {0: u'a', 1: u'b', 2: None}
1795 )
1796
1797 # Issue #14850
1798 self.assertRaises(UnicodeDecodeError,
1799 codecs.charmap_decode, "\x00\x01\x02", "strict",
1800 {0: u'a', 1: u'b', 2: u'\ufffe'}
1801 )
1802
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001803 self.assertEqual(
1804 codecs.charmap_decode("\x00\x01\x02", "replace",
1805 {0: u'a', 1: u'b'}),
1806 (u"ab\ufffd", 3)
1807 )
1808
1809 self.assertEqual(
1810 codecs.charmap_decode("\x00\x01\x02", "replace",
1811 {0: u'a', 1: u'b', 2: None}),
1812 (u"ab\ufffd", 3)
1813 )
1814
Serhiy Storchaka95997452013-01-15 14:42:59 +02001815 # Issue #14850
1816 self.assertEqual(
1817 codecs.charmap_decode("\x00\x01\x02", "replace",
1818 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1819 (u"ab\ufffd", 3)
1820 )
1821
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001822 self.assertEqual(
1823 codecs.charmap_decode("\x00\x01\x02", "ignore",
1824 {0: u'a', 1: u'b'}),
1825 (u"ab", 3)
1826 )
1827
1828 self.assertEqual(
1829 codecs.charmap_decode("\x00\x01\x02", "ignore",
1830 {0: u'a', 1: u'b', 2: None}),
1831 (u"ab", 3)
1832 )
1833
Serhiy Storchaka95997452013-01-15 14:42:59 +02001834 # Issue #14850
1835 self.assertEqual(
1836 codecs.charmap_decode("\x00\x01\x02", "ignore",
1837 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1838 (u"ab", 3)
1839 )
1840
1841 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001842 self.assertEqual(
1843 codecs.charmap_decode(allbytes, "ignore", {}),
1844 (u"", len(allbytes))
1845 )
1846
1847 def test_decode_with_int2int_map(self):
1848 a = ord(u'a')
1849 b = ord(u'b')
1850 c = ord(u'c')
1851
1852 self.assertEqual(
1853 codecs.charmap_decode("\x00\x01\x02", "strict",
1854 {0: a, 1: b, 2: c}),
1855 (u"abc", 3)
1856 )
1857
1858 # Issue #15379
1859 self.assertEqual(
1860 codecs.charmap_decode("\x00\x01\x02", "strict",
1861 {0: 0x10FFFF, 1: b, 2: c}),
1862 (u"\U0010FFFFbc", 3)
1863 )
1864
1865 self.assertRaises(TypeError,
1866 codecs.charmap_decode, "\x00\x01\x02", "strict",
1867 {0: 0x110000, 1: b, 2: c}
1868 )
1869
1870 self.assertRaises(UnicodeDecodeError,
1871 codecs.charmap_decode, "\x00\x01\x02", "strict",
1872 {0: a, 1: b},
1873 )
1874
Serhiy Storchaka95997452013-01-15 14:42:59 +02001875 self.assertRaises(UnicodeDecodeError,
1876 codecs.charmap_decode, "\x00\x01\x02", "strict",
1877 {0: a, 1: b, 2: 0xFFFE},
1878 )
1879
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001880 self.assertEqual(
1881 codecs.charmap_decode("\x00\x01\x02", "replace",
1882 {0: a, 1: b}),
1883 (u"ab\ufffd", 3)
1884 )
1885
1886 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001887 codecs.charmap_decode("\x00\x01\x02", "replace",
1888 {0: a, 1: b, 2: 0xFFFE}),
1889 (u"ab\ufffd", 3)
1890 )
1891
1892 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001893 codecs.charmap_decode("\x00\x01\x02", "ignore",
1894 {0: a, 1: b}),
1895 (u"ab", 3)
1896 )
1897
Serhiy Storchaka95997452013-01-15 14:42:59 +02001898 self.assertEqual(
1899 codecs.charmap_decode("\x00\x01\x02", "ignore",
1900 {0: a, 1: b, 2: 0xFFFE}),
1901 (u"ab", 3)
1902 )
1903
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001904
Georg Brandl8f99f812006-10-29 08:39:22 +00001905class WithStmtTest(unittest.TestCase):
1906 def test_encodedfile(self):
1907 f = StringIO.StringIO("\xc3\xbc")
1908 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001909 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001910
1911 def test_streamreaderwriter(self):
1912 f = StringIO.StringIO("\xc3\xbc")
1913 info = codecs.lookup("utf-8")
1914 with codecs.StreamReaderWriter(f, info.streamreader,
1915 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001916 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001917
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001918
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001919class UnicodeEscapeTest(unittest.TestCase):
1920 def test_empty(self):
1921 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1922 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1923
1924 def test_raw_encode(self):
1925 encode = codecs.unicode_escape_encode
1926 for b in range(32, 127):
1927 if b != ord('\\'):
1928 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1929
1930 def test_raw_decode(self):
1931 decode = codecs.unicode_escape_decode
1932 for b in range(256):
1933 if b != ord('\\'):
1934 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1935
1936 def test_escape_encode(self):
1937 encode = codecs.unicode_escape_encode
1938 check = coding_checker(self, encode)
1939 check(u'\t', r'\t')
1940 check(u'\n', r'\n')
1941 check(u'\r', r'\r')
1942 check(u'\\', r'\\')
1943 for b in range(32):
1944 if chr(b) not in '\t\n\r':
1945 check(unichr(b), '\\x%02x' % b)
1946 for b in range(127, 256):
1947 check(unichr(b), '\\x%02x' % b)
1948 check(u'\u20ac', r'\u20ac')
1949 check(u'\U0001d120', r'\U0001d120')
1950
1951 def test_escape_decode(self):
1952 decode = codecs.unicode_escape_decode
1953 check = coding_checker(self, decode)
1954 check("[\\\n]", u"[]")
1955 check(r'[\"]', u'["]')
1956 check(r"[\']", u"[']")
1957 check(r"[\\]", ur"[\]")
1958 check(r"[\a]", u"[\x07]")
1959 check(r"[\b]", u"[\x08]")
1960 check(r"[\t]", u"[\x09]")
1961 check(r"[\n]", u"[\x0a]")
1962 check(r"[\v]", u"[\x0b]")
1963 check(r"[\f]", u"[\x0c]")
1964 check(r"[\r]", u"[\x0d]")
1965 check(r"[\7]", u"[\x07]")
1966 check(r"[\8]", ur"[\8]")
1967 check(r"[\78]", u"[\x078]")
1968 check(r"[\41]", u"[!]")
1969 check(r"[\418]", u"[!8]")
1970 check(r"[\101]", u"[A]")
1971 check(r"[\1010]", u"[A0]")
1972 check(r"[\x41]", u"[A]")
1973 check(r"[\x410]", u"[A0]")
1974 check(r"\u20ac", u"\u20ac")
1975 check(r"\U0001d120", u"\U0001d120")
1976 for b in range(256):
1977 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
1978 check('\\' + chr(b), u'\\' + unichr(b))
1979
1980 def test_decode_errors(self):
1981 decode = codecs.unicode_escape_decode
1982 for c, d in ('x', 2), ('u', 4), ('U', 4):
1983 for i in range(d):
1984 self.assertRaises(UnicodeDecodeError, decode,
1985 "\\" + c + "0"*i)
1986 self.assertRaises(UnicodeDecodeError, decode,
1987 "[\\" + c + "0"*i + "]")
1988 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1989 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1990 self.assertEqual(decode(data, "replace"),
1991 (u"[\ufffd]\ufffd", len(data)))
1992 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1993 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1994 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1995
1996
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02001997class RawUnicodeEscapeTest(unittest.TestCase):
1998 def test_empty(self):
1999 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
2000 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
2001
2002 def test_raw_encode(self):
2003 encode = codecs.raw_unicode_escape_encode
2004 for b in range(256):
2005 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2006
2007 def test_raw_decode(self):
2008 decode = codecs.raw_unicode_escape_decode
2009 for b in range(256):
2010 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2011
2012 def test_escape_encode(self):
2013 encode = codecs.raw_unicode_escape_encode
2014 check = coding_checker(self, encode)
2015 for b in range(256):
2016 if chr(b) not in 'uU':
2017 check(u'\\' + unichr(b), '\\' + chr(b))
2018 check(u'\u20ac', r'\u20ac')
2019 check(u'\U0001d120', r'\U0001d120')
2020
2021 def test_escape_decode(self):
2022 decode = codecs.raw_unicode_escape_decode
2023 check = coding_checker(self, decode)
2024 for b in range(256):
2025 if chr(b) not in 'uU':
2026 check('\\' + chr(b), u'\\' + unichr(b))
2027 check(r"\u20ac", u"\u20ac")
2028 check(r"\U0001d120", u"\U0001d120")
2029
2030 def test_decode_errors(self):
2031 decode = codecs.raw_unicode_escape_decode
2032 for c, d in ('u', 4), ('U', 4):
2033 for i in range(d):
2034 self.assertRaises(UnicodeDecodeError, decode,
2035 "\\" + c + "0"*i)
2036 self.assertRaises(UnicodeDecodeError, decode,
2037 "[\\" + c + "0"*i + "]")
2038 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2039 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2040 self.assertEqual(decode(data, "replace"),
2041 (u"[\ufffd]\ufffd", len(data)))
2042 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2043 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2044 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2045
2046
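# For the BOM-prefixed encodings tested below, codecs.open() must emit the BOM
# exactly once, and only when writing starts at position 0 of the file.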
Victor Stinner262be5e2010-05-22 02:11:07 +00002047class BomTest(unittest.TestCase):
2048 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00002049 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00002050 tests = ("utf-16",
2051 "utf-16-le",
2052 "utf-16-be",
2053 "utf-32",
2054 "utf-32-le",
2055 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02002056 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00002057 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00002058 # Check if the BOM is written only once
2059 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00002060 f.write(data)
2061 f.write(data)
2062 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002063 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002064 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002065 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002066
Victor Stinner7df55da2010-05-22 13:37:56 +00002067 # Check that the BOM is written after a seek(0)
2068 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2069 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002070 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002071 f.seek(0)
2072 f.write(data)
2073 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002074 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002075
2076 # (StreamWriter) Check that the BOM is written after a seek(0)
2077 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2078 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002079 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002080 f.writer.seek(0)
2081 f.writer.write(data)
2082 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002083 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002084
2085 # Check that the BOM is not written after a seek() at a position
2086 # different than the start
2087 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2088 f.write(data)
2089 f.seek(f.tell())
2090 f.write(data)
2091 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002092 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002093
2094 # (StreamWriter) Check that the BOM is not written after a seek()
2095 # at a position different than the start
2096 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2097 f.writer.write(data)
2098 f.writer.seek(f.writer.tell())
2099 f.writer.write(data)
2100 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002101 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002102
Victor Stinner262be5e2010-05-22 02:11:07 +00002103
Fred Drake2e2be372001-09-20 21:33:42 +00002104def test_main():
Walter Dörwald21d3a322003-05-01 17:45:56 +00002105 test_support.run_unittest(
Walter Dörwald6e390802007-08-17 16:41:28 +00002106 UTF32Test,
2107 UTF32LETest,
2108 UTF32BETest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002109 UTF16Test,
Walter Dörwald69652032004-09-07 20:24:22 +00002110 UTF16LETest,
2111 UTF16BETest,
2112 UTF8Test,
Martin v. Löwis412ed3b2006-01-08 10:45:39 +00002113 UTF8SigTest,
Walter Dörwalde22d3392005-11-17 08:52:34 +00002114 UTF7Test,
2115 UTF16ExTest,
2116 ReadBufferTest,
2117 CharBufferTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002118 EscapeDecodeTest,
2119 RecodingTest,
2120 PunycodeTest,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002121 UnicodeInternalTest,
Martin v. Löwisa1dde132004-03-24 16:48:24 +00002122 NameprepTest,
Walter Dörwald78a0be62006-04-14 18:25:39 +00002123 IDNACodecTest,
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00002124 CodecsModuleTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002125 StreamReaderTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002126 EncodedFileTest,
Walter Dörwaldc9878e12005-07-20 22:15:39 +00002127 Str2StrTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002128 BasicUnicodeTest,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002129 BasicStrTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002130 CharmapTest,
2131 WithStmtTest,
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002132 UnicodeEscapeTest,
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002133 RawUnicodeEscapeTest,
        TransformCodecTest,
Victor Stinner262be5e2010-05-22 02:11:07 +00002134        BomTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002135 )
Fred Drake2e2be372001-09-20 21:33:42 +00002136
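# The blacklist checks in the TransformCodecTest class below refer to
# bytes_transform_encodings, which is not defined anywhere else in this file.
# The list here is an assumption modelled on the binary transform codecs used
# above; adjust it if a different set was intended.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]
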
class TransformCodecTest(unittest.TestCase):

Serhiy Storchakab4f3d802014-11-07 14:07:43 +02002137    def test_uu_invalid(self):
2138 # Missing "begin" line
2139 self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
2140
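    # The following four checks are modelled on Python 3's TransformCodecTest,
    # where str.encode() and bytes.decode() refuse transform codecs with a
    # LookupError pointing at codecs.encode()/codecs.decode().  They rely on
    # assertRaisesRegex and exception.__cause__, which exist only on Python 3,
    # so they are skipped in this Python 2 suite.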
    @unittest.skip("exercises Python 3 codec blacklisting; not applicable to this Python 2 suite")
Serhiy Storchakac7797dc2015-05-31 20:21:00 +03002141    def test_text_to_binary_blacklists_binary_transforms(self):
2142 # Check binary -> binary codecs give a good error for str input
2143 bad_input = "bad input type"
2144 for encoding in bytes_transform_encodings:
2145 fmt = (r"{!r} is not a text encoding; "
2146 r"use codecs.encode\(\) to handle arbitrary codecs")
2147 msg = fmt.format(encoding)
2148 with self.assertRaisesRegex(LookupError, msg) as failure:
2149 bad_input.encode(encoding)
2150 self.assertIsNone(failure.exception.__cause__)
2151
    @unittest.skip("exercises Python 3 codec blacklisting; not applicable to this Python 2 suite")
2152    def test_text_to_binary_blacklists_text_transforms(self):
2153 # Check str.encode gives a good error message for str -> str codecs
2154 msg = (r"^'rot_13' is not a text encoding; "
2155 r"use codecs.encode\(\) to handle arbitrary codecs")
2156 with self.assertRaisesRegex(LookupError, msg):
2157 "just an example message".encode("rot_13")
2158
    @unittest.skip("exercises Python 3 codec blacklisting; not applicable to this Python 2 suite")
2159    def test_binary_to_text_blacklists_binary_transforms(self):
2160 # Check bytes.decode and bytearray.decode give a good error
2161 # message for binary -> binary codecs
2162 data = b"encode first to ensure we meet any format restrictions"
2163 for encoding in bytes_transform_encodings:
2164 encoded_data = codecs.encode(data, encoding)
2165 fmt = (r"{!r} is not a text encoding; "
2166 r"use codecs.decode\(\) to handle arbitrary codecs")
2167 msg = fmt.format(encoding)
2168 with self.assertRaisesRegex(LookupError, msg):
2169 encoded_data.decode(encoding)
2170 with self.assertRaisesRegex(LookupError, msg):
2171 bytearray(encoded_data).decode(encoding)
2172
    @unittest.skip("exercises Python 3 codec blacklisting; not applicable to this Python 2 suite")
2173    def test_binary_to_text_blacklists_text_transforms(self):
2174 # Check str -> str codec gives a good error for binary input
2175 for bad_input in (b"immutable", bytearray(b"mutable")):
2176 msg = (r"^'rot_13' is not a text encoding; "
2177 r"use codecs.decode\(\) to handle arbitrary codecs")
2178 with self.assertRaisesRegex(LookupError, msg) as failure:
2179 bad_input.decode("rot_13")
2180 self.assertIsNone(failure.exception.__cause__)
2181
Fred Drake2e2be372001-09-20 21:33:42 +00002182
2183if __name__ == "__main__":
2184 test_main()