blob: de80b0776c8116e806ee57f653a6bff18fbf2d63 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02005import sys, StringIO
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
Serhiy Storchakac8e58122013-01-29 10:20:34 +02007def coding_checker(self, coder):
8 def check(input, expect):
9 self.assertEqual(coder(input), (expect, len(input)))
10 return check
11
Walter Dörwald69652032004-09-07 20:24:22 +000012class Queue(object):
13 """
14 queue: write bytes at one end, read bytes from the other end
15 """
16 def __init__(self):
17 self._buffer = ""
18
19 def write(self, chars):
20 self._buffer += chars
21
22 def read(self, size=-1):
23 if size<0:
24 s = self._buffer
25 self._buffer = ""
26 return s
27 else:
28 s = self._buffer[:size]
29 self._buffer = self._buffer[size:]
30 return s
31
Walter Dörwalde57d7b12004-12-21 22:24:00 +000032class ReadTest(unittest.TestCase):
33 def check_partial(self, input, partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000034 # get a StreamReader for the encoding and feed the bytestring version
Walter Dörwaldfc7e72d2007-11-19 12:14:05 +000035 # of input to the reader byte by byte. Read everything available from
Walter Dörwald69652032004-09-07 20:24:22 +000036 # the StreamReader and check that the results equal the appropriate
37 # entries from partialresults.
38 q = Queue()
Walter Dörwalde57d7b12004-12-21 22:24:00 +000039 r = codecs.getreader(self.encoding)(q)
Walter Dörwald69652032004-09-07 20:24:22 +000040 result = u""
Walter Dörwalde57d7b12004-12-21 22:24:00 +000041 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
Walter Dörwald69652032004-09-07 20:24:22 +000042 q.write(c)
43 result += r.read()
44 self.assertEqual(result, partialresult)
45 # check that there's nothing left in the buffers
46 self.assertEqual(r.read(), u"")
47 self.assertEqual(r.bytebuffer, "")
48 self.assertEqual(r.charbuffer, u"")
49
Walter Dörwaldabb02e52006-03-15 11:35:15 +000050 # do the check again, this time using a incremental decoder
51 d = codecs.getincrementaldecoder(self.encoding)()
52 result = u""
53 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
54 result += d.decode(c)
55 self.assertEqual(result, partialresult)
56 # check that there's nothing left in the buffers
57 self.assertEqual(d.decode("", True), u"")
58 self.assertEqual(d.buffer, "")
59
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +000060 # Check whether the reset method works properly
Walter Dörwaldabb02e52006-03-15 11:35:15 +000061 d.reset()
62 result = u""
63 for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
64 result += d.decode(c)
65 self.assertEqual(result, partialresult)
66 # check that there's nothing left in the buffers
67 self.assertEqual(d.decode("", True), u"")
68 self.assertEqual(d.buffer, "")
69
70 # check iterdecode()
71 encoded = input.encode(self.encoding)
72 self.assertEqual(
73 input,
74 u"".join(codecs.iterdecode(encoded, self.encoding))
75 )
76
Walter Dörwalde57d7b12004-12-21 22:24:00 +000077 def test_readline(self):
78 def getreader(input):
79 stream = StringIO.StringIO(input.encode(self.encoding))
80 return codecs.getreader(self.encoding)(stream)
81
Walter Dörwaldca199432006-03-06 22:39:12 +000082 def readalllines(input, keepends=True, size=None):
Walter Dörwalde57d7b12004-12-21 22:24:00 +000083 reader = getreader(input)
84 lines = []
85 while True:
Walter Dörwaldca199432006-03-06 22:39:12 +000086 line = reader.readline(size=size, keepends=keepends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000087 if not line:
88 break
89 lines.append(line)
Walter Dörwaldca199432006-03-06 22:39:12 +000090 return "|".join(lines)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000091
92 s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
Walter Dörwaldca199432006-03-06 22:39:12 +000093 sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
94 sexpectednoends = u"foo|bar|baz|spam|eggs"
95 self.assertEqual(readalllines(s, True), sexpected)
96 self.assertEqual(readalllines(s, False), sexpectednoends)
97 self.assertEqual(readalllines(s, True, 10), sexpected)
98 self.assertEqual(readalllines(s, False, 10), sexpectednoends)
Walter Dörwalde57d7b12004-12-21 22:24:00 +000099
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200100 lineends = ("\n", "\r\n", "\r", u"\u2028")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000101 # Test long lines (multiple calls to read() in readline())
102 vw = []
103 vwo = []
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200104 for (i, lineend) in enumerate(lineends):
105 vw.append((i*200+200)*u"\u3042" + lineend)
106 vwo.append((i*200+200)*u"\u3042")
107 self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
108 self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000109
110 # Test lines where the first read might end with \r, so the
111 # reader has to look ahead whether this is a lone \r or a \r\n
112 for size in xrange(80):
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200113 for lineend in lineends:
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000114 s = 10*(size*u"a" + lineend + u"xxx\n")
115 reader = getreader(s)
116 for i in xrange(10):
117 self.assertEqual(
118 reader.readline(keepends=True),
119 size*u"a" + lineend,
120 )
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200121 self.assertEqual(
122 reader.readline(keepends=True),
123 "xxx\n",
124 )
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000125 reader = getreader(s)
126 for i in xrange(10):
127 self.assertEqual(
128 reader.readline(keepends=False),
129 size*u"a",
130 )
Serhiy Storchaka6a036792014-02-06 09:26:32 +0200131 self.assertEqual(
132 reader.readline(keepends=False),
133 "xxx",
134 )
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000135
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200136 def test_mixed_readline_and_read(self):
137 lines = ["Humpty Dumpty sat on a wall,\n",
138 "Humpty Dumpty had a great fall.\r\n",
139 "All the king's horses and all the king's men\r",
140 "Couldn't put Humpty together again."]
141 data = ''.join(lines)
142 def getreader():
143 stream = StringIO.StringIO(data.encode(self.encoding))
144 return codecs.getreader(self.encoding)(stream)
145
146 # Issue #8260: Test readline() followed by read()
147 f = getreader()
148 self.assertEqual(f.readline(), lines[0])
149 self.assertEqual(f.read(), ''.join(lines[1:]))
150 self.assertEqual(f.read(), '')
151
152 # Issue #16636: Test readline() followed by readlines()
153 f = getreader()
154 self.assertEqual(f.readline(), lines[0])
155 self.assertEqual(f.readlines(), lines[1:])
156 self.assertEqual(f.read(), '')
157
158 # Test read() followed by read()
159 f = getreader()
160 self.assertEqual(f.read(size=40, chars=5), data[:5])
161 self.assertEqual(f.read(), data[5:])
162 self.assertEqual(f.read(), '')
163
164 # Issue #12446: Test read() followed by readlines()
165 f = getreader()
166 self.assertEqual(f.read(size=40, chars=5), data[:5])
167 self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
168 self.assertEqual(f.read(), '')
169
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000170 def test_bug1175396(self):
171 s = [
172 '<%!--===================================================\r\n',
173 ' BLOG index page: show recent articles,\r\n',
174 ' today\'s articles, or articles of a specific date.\r\n',
175 '========================================================--%>\r\n',
176 '<%@inputencoding="ISO-8859-1"%>\r\n',
177 '<%@pagetemplate=TEMPLATE.y%>\r\n',
178 '<%@import=import frog.util, frog%>\r\n',
179 '<%@import=import frog.objects%>\r\n',
180 '<%@import=from frog.storageerrors import StorageError%>\r\n',
181 '<%\r\n',
182 '\r\n',
183 'import logging\r\n',
184 'log=logging.getLogger("Snakelets.logger")\r\n',
185 '\r\n',
186 '\r\n',
187 'user=self.SessionCtx.user\r\n',
188 'storageEngine=self.SessionCtx.storageEngine\r\n',
189 '\r\n',
190 '\r\n',
191 'def readArticlesFromDate(date, count=None):\r\n',
192 ' entryids=storageEngine.listBlogEntries(date)\r\n',
193 ' entryids.reverse() # descending\r\n',
194 ' if count:\r\n',
195 ' entryids=entryids[:count]\r\n',
196 ' try:\r\n',
197 ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
198 ' except StorageError,x:\r\n',
199 ' log.error("Error loading articles: "+str(x))\r\n',
200 ' self.abort("cannot load articles")\r\n',
201 '\r\n',
202 'showdate=None\r\n',
203 '\r\n',
204 'arg=self.Request.getArg()\r\n',
205 'if arg=="today":\r\n',
206 ' #-------------------- TODAY\'S ARTICLES\r\n',
207 ' self.write("<h2>Today\'s articles</h2>")\r\n',
208 ' showdate = frog.util.isodatestr() \r\n',
209 ' entries = readArticlesFromDate(showdate)\r\n',
210 'elif arg=="active":\r\n',
211 ' #-------------------- ACTIVE ARTICLES redirect\r\n',
212 ' self.Yredirect("active.y")\r\n',
213 'elif arg=="login":\r\n',
214 ' #-------------------- LOGIN PAGE redirect\r\n',
215 ' self.Yredirect("login.y")\r\n',
216 'elif arg=="date":\r\n',
217 ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
218 ' showdate = self.Request.getParameter("date")\r\n',
219 ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
220 ' entries = readArticlesFromDate(showdate)\r\n',
221 'else:\r\n',
222 ' #-------------------- RECENT ARTICLES\r\n',
223 ' self.write("<h2>Recent articles</h2>")\r\n',
224 ' dates=storageEngine.listBlogEntryDates()\r\n',
225 ' if dates:\r\n',
226 ' entries=[]\r\n',
227 ' SHOWAMOUNT=10\r\n',
228 ' for showdate in dates:\r\n',
229 ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
230 ' if len(entries)>=SHOWAMOUNT:\r\n',
231 ' break\r\n',
232 ' \r\n',
233 ]
234 stream = StringIO.StringIO("".join(s).encode(self.encoding))
235 reader = codecs.getreader(self.encoding)(stream)
236 for (i, line) in enumerate(reader):
237 self.assertEqual(line, s[i])
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000238
239 def test_readlinequeue(self):
240 q = Queue()
241 writer = codecs.getwriter(self.encoding)(q)
242 reader = codecs.getreader(self.encoding)(q)
243
244 # No lineends
245 writer.write(u"foo\r")
246 self.assertEqual(reader.readline(keepends=False), u"foo")
247 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000248 self.assertEqual(reader.readline(keepends=False), u"")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000249 self.assertEqual(reader.readline(keepends=False), u"bar")
250 writer.write(u"baz")
251 self.assertEqual(reader.readline(keepends=False), u"baz")
252 self.assertEqual(reader.readline(keepends=False), u"")
253
254 # Lineends
255 writer.write(u"foo\r")
256 self.assertEqual(reader.readline(keepends=True), u"foo\r")
257 writer.write(u"\nbar\r")
Walter Dörwald43148c82005-04-21 21:45:36 +0000258 self.assertEqual(reader.readline(keepends=True), u"\n")
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000259 self.assertEqual(reader.readline(keepends=True), u"bar\r")
260 writer.write(u"baz")
261 self.assertEqual(reader.readline(keepends=True), u"baz")
262 self.assertEqual(reader.readline(keepends=True), u"")
263 writer.write(u"foo\r\n")
264 self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
265
Walter Dörwald9fa09462005-01-10 12:01:39 +0000266 def test_bug1098990_a(self):
267 s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
268 s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
269 s3 = u"next line.\r\n"
270
271 s = (s1+s2+s3).encode(self.encoding)
272 stream = StringIO.StringIO(s)
273 reader = codecs.getreader(self.encoding)(stream)
274 self.assertEqual(reader.readline(), s1)
275 self.assertEqual(reader.readline(), s2)
276 self.assertEqual(reader.readline(), s3)
277 self.assertEqual(reader.readline(), u"")
278
279 def test_bug1098990_b(self):
280 s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
281 s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
282 s3 = u"stillokay:bbbbxx\r\n"
283 s4 = u"broken!!!!badbad\r\n"
284 s5 = u"againokay.\r\n"
285
286 s = (s1+s2+s3+s4+s5).encode(self.encoding)
287 stream = StringIO.StringIO(s)
288 reader = codecs.getreader(self.encoding)(stream)
289 self.assertEqual(reader.readline(), s1)
290 self.assertEqual(reader.readline(), s2)
291 self.assertEqual(reader.readline(), s3)
292 self.assertEqual(reader.readline(), s4)
293 self.assertEqual(reader.readline(), s5)
294 self.assertEqual(reader.readline(), u"")
295
Walter Dörwald6e390802007-08-17 16:41:28 +0000296class UTF32Test(ReadTest):
297 encoding = "utf-32"
298
299 spamle = ('\xff\xfe\x00\x00'
300 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
301 's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
302 spambe = ('\x00\x00\xfe\xff'
303 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
304 '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
305
306 def test_only_one_bom(self):
307 _,_,reader,writer = codecs.lookup(self.encoding)
308 # encode some stream
309 s = StringIO.StringIO()
310 f = writer(s)
311 f.write(u"spam")
312 f.write(u"spam")
313 d = s.getvalue()
314 # check whether there is exactly one BOM in it
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000315 self.assertTrue(d == self.spamle or d == self.spambe)
Walter Dörwald6e390802007-08-17 16:41:28 +0000316 # try to read it back
317 s = StringIO.StringIO(d)
318 f = reader(s)
Ezio Melotti2623a372010-11-21 13:34:58 +0000319 self.assertEqual(f.read(), u"spamspam")
Walter Dörwald6e390802007-08-17 16:41:28 +0000320
321 def test_badbom(self):
322 s = StringIO.StringIO(4*"\xff")
323 f = codecs.getreader(self.encoding)(s)
324 self.assertRaises(UnicodeError, f.read)
325
326 s = StringIO.StringIO(8*"\xff")
327 f = codecs.getreader(self.encoding)(s)
328 self.assertRaises(UnicodeError, f.read)
329
330 def test_partial(self):
331 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200332 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000333 [
334 u"", # first byte of BOM read
335 u"", # second byte of BOM read
336 u"", # third byte of BOM read
337 u"", # fourth byte of BOM read => byteorder known
338 u"",
339 u"",
340 u"",
341 u"\x00",
342 u"\x00",
343 u"\x00",
344 u"\x00",
345 u"\x00\xff",
346 u"\x00\xff",
347 u"\x00\xff",
348 u"\x00\xff",
349 u"\x00\xff\u0100",
350 u"\x00\xff\u0100",
351 u"\x00\xff\u0100",
352 u"\x00\xff\u0100",
353 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200354 u"\x00\xff\u0100\uffff",
355 u"\x00\xff\u0100\uffff",
356 u"\x00\xff\u0100\uffff",
357 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000358 ]
359 )
360
Georg Brandle9741f32009-09-17 11:28:09 +0000361 def test_handlers(self):
362 self.assertEqual((u'\ufffd', 1),
363 codecs.utf_32_decode('\x01', 'replace', True))
364 self.assertEqual((u'', 1),
365 codecs.utf_32_decode('\x01', 'ignore', True))
366
Walter Dörwald6e390802007-08-17 16:41:28 +0000367 def test_errors(self):
368 self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
369 "\xff", "strict", True)
370
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000371 def test_issue8941(self):
372 # Issue #8941: insufficient result allocation when decoding into
373 # surrogate pairs on UCS-2 builds.
374 encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
375 self.assertEqual(u'\U00010000' * 1024,
376 codecs.utf_32_decode(encoded_le)[0])
377 encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
378 self.assertEqual(u'\U00010000' * 1024,
379 codecs.utf_32_decode(encoded_be)[0])
380
Walter Dörwald6e390802007-08-17 16:41:28 +0000381class UTF32LETest(ReadTest):
382 encoding = "utf-32-le"
383
384 def test_partial(self):
385 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200386 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000387 [
388 u"",
389 u"",
390 u"",
391 u"\x00",
392 u"\x00",
393 u"\x00",
394 u"\x00",
395 u"\x00\xff",
396 u"\x00\xff",
397 u"\x00\xff",
398 u"\x00\xff",
399 u"\x00\xff\u0100",
400 u"\x00\xff\u0100",
401 u"\x00\xff\u0100",
402 u"\x00\xff\u0100",
403 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200404 u"\x00\xff\u0100\uffff",
405 u"\x00\xff\u0100\uffff",
406 u"\x00\xff\u0100\uffff",
407 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000408 ]
409 )
410
411 def test_simple(self):
412 self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
413
414 def test_errors(self):
415 self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
416 "\xff", "strict", True)
417
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000418 def test_issue8941(self):
419 # Issue #8941: insufficient result allocation when decoding into
420 # surrogate pairs on UCS-2 builds.
421 encoded = '\x00\x00\x01\x00' * 1024
422 self.assertEqual(u'\U00010000' * 1024,
423 codecs.utf_32_le_decode(encoded)[0])
424
Walter Dörwald6e390802007-08-17 16:41:28 +0000425class UTF32BETest(ReadTest):
426 encoding = "utf-32-be"
427
428 def test_partial(self):
429 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200430 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000431 [
432 u"",
433 u"",
434 u"",
435 u"\x00",
436 u"\x00",
437 u"\x00",
438 u"\x00",
439 u"\x00\xff",
440 u"\x00\xff",
441 u"\x00\xff",
442 u"\x00\xff",
443 u"\x00\xff\u0100",
444 u"\x00\xff\u0100",
445 u"\x00\xff\u0100",
446 u"\x00\xff\u0100",
447 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200448 u"\x00\xff\u0100\uffff",
449 u"\x00\xff\u0100\uffff",
450 u"\x00\xff\u0100\uffff",
451 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald6e390802007-08-17 16:41:28 +0000452 ]
453 )
454
455 def test_simple(self):
456 self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
457
458 def test_errors(self):
459 self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
460 "\xff", "strict", True)
461
Antoine Pitroucca3a3f2010-06-11 21:42:26 +0000462 def test_issue8941(self):
463 # Issue #8941: insufficient result allocation when decoding into
464 # surrogate pairs on UCS-2 builds.
465 encoded = '\x00\x01\x00\x00' * 1024
466 self.assertEqual(u'\U00010000' * 1024,
467 codecs.utf_32_be_decode(encoded)[0])
468
469
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000470class UTF16Test(ReadTest):
471 encoding = "utf-16"
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000472
473 spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
474 spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
475
476 def test_only_one_bom(self):
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000477 _,_,reader,writer = codecs.lookup(self.encoding)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000478 # encode some stream
479 s = StringIO.StringIO()
480 f = writer(s)
481 f.write(u"spam")
482 f.write(u"spam")
483 d = s.getvalue()
484 # check whether there is exactly one BOM in it
Benjamin Peterson5c8da862009-06-30 22:57:08 +0000485 self.assertTrue(d == self.spamle or d == self.spambe)
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000486 # try to read it back
487 s = StringIO.StringIO(d)
488 f = reader(s)
Ezio Melotti2623a372010-11-21 13:34:58 +0000489 self.assertEqual(f.read(), u"spamspam")
Marc-André Lemburga37171d2001-06-19 20:09:28 +0000490
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000491 def test_badbom(self):
492 s = StringIO.StringIO("\xff\xff")
Walter Dörwalda9620d12005-02-08 10:10:01 +0000493 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000494 self.assertRaises(UnicodeError, f.read)
495
496 s = StringIO.StringIO("\xff\xff\xff\xff")
Walter Dörwalda9620d12005-02-08 10:10:01 +0000497 f = codecs.getreader(self.encoding)(s)
Walter Dörwald1f1d2522005-02-04 14:15:34 +0000498 self.assertRaises(UnicodeError, f.read)
499
Walter Dörwald69652032004-09-07 20:24:22 +0000500 def test_partial(self):
501 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200502 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000503 [
504 u"", # first byte of BOM read
505 u"", # second byte of BOM read => byteorder known
506 u"",
507 u"\x00",
508 u"\x00",
509 u"\x00\xff",
510 u"\x00\xff",
511 u"\x00\xff\u0100",
512 u"\x00\xff\u0100",
513 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200514 u"\x00\xff\u0100\uffff",
515 u"\x00\xff\u0100\uffff",
516 u"\x00\xff\u0100\uffff",
517 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000518 ]
519 )
520
Georg Brandle9741f32009-09-17 11:28:09 +0000521 def test_handlers(self):
522 self.assertEqual((u'\ufffd', 1),
523 codecs.utf_16_decode('\x01', 'replace', True))
524 self.assertEqual((u'', 1),
525 codecs.utf_16_decode('\x01', 'ignore', True))
526
Walter Dörwalde22d3392005-11-17 08:52:34 +0000527 def test_errors(self):
528 self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
529
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000530 def test_bug691291(self):
531 # Files are always opened in binary mode, even if no binary mode was
532 # specified. This means that no automatic conversion of '\n' is done
533 # on reading and writing.
534 s1 = u'Hello\r\nworld\r\n'
535
536 s = s1.encode(self.encoding)
Victor Stinner6c603c42011-05-23 16:19:31 +0200537 self.addCleanup(test_support.unlink, test_support.TESTFN)
538 with open(test_support.TESTFN, 'wb') as fp:
539 fp.write(s)
540 with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
541 self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000542
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000543class UTF16LETest(ReadTest):
544 encoding = "utf-16-le"
Walter Dörwald69652032004-09-07 20:24:22 +0000545
546 def test_partial(self):
547 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200548 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000549 [
550 u"",
551 u"\x00",
552 u"\x00",
553 u"\x00\xff",
554 u"\x00\xff",
555 u"\x00\xff\u0100",
556 u"\x00\xff\u0100",
557 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200558 u"\x00\xff\u0100\uffff",
559 u"\x00\xff\u0100\uffff",
560 u"\x00\xff\u0100\uffff",
561 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000562 ]
563 )
564
Walter Dörwalde22d3392005-11-17 08:52:34 +0000565 def test_errors(self):
Antoine Pitrou715a63b2012-07-21 00:52:06 +0200566 tests = [
567 (b'\xff', u'\ufffd'),
568 (b'A\x00Z', u'A\ufffd'),
569 (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
570 (b'\x00\xd8', u'\ufffd'),
571 (b'\x00\xd8A', u'\ufffd'),
572 (b'\x00\xd8A\x00', u'\ufffdA'),
573 (b'\x00\xdcA\x00', u'\ufffdA'),
574 ]
575 for raw, expected in tests:
576 self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
577 raw, 'strict', True)
578 self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000579
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000580class UTF16BETest(ReadTest):
581 encoding = "utf-16-be"
Walter Dörwald69652032004-09-07 20:24:22 +0000582
583 def test_partial(self):
584 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200585 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000586 [
587 u"",
588 u"\x00",
589 u"\x00",
590 u"\x00\xff",
591 u"\x00\xff",
592 u"\x00\xff\u0100",
593 u"\x00\xff\u0100",
594 u"\x00\xff\u0100\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200595 u"\x00\xff\u0100\uffff",
596 u"\x00\xff\u0100\uffff",
597 u"\x00\xff\u0100\uffff",
598 u"\x00\xff\u0100\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000599 ]
600 )
601
Walter Dörwalde22d3392005-11-17 08:52:34 +0000602 def test_errors(self):
Antoine Pitrou715a63b2012-07-21 00:52:06 +0200603 tests = [
604 (b'\xff', u'\ufffd'),
605 (b'\x00A\xff', u'A\ufffd'),
606 (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
607 (b'\xd8\x00', u'\ufffd'),
608 (b'\xd8\x00\xdc', u'\ufffd'),
609 (b'\xd8\x00\x00A', u'\ufffdA'),
610 (b'\xdc\x00\x00A', u'\ufffdA'),
611 ]
612 for raw, expected in tests:
613 self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
614 raw, 'strict', True)
615 self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000616
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000617class UTF8Test(ReadTest):
618 encoding = "utf-8"
Walter Dörwald69652032004-09-07 20:24:22 +0000619
620 def test_partial(self):
621 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200622 u"\x00\xff\u07ff\u0800\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000623 [
624 u"\x00",
625 u"\x00",
626 u"\x00\xff",
627 u"\x00\xff",
628 u"\x00\xff\u07ff",
629 u"\x00\xff\u07ff",
630 u"\x00\xff\u07ff",
631 u"\x00\xff\u07ff\u0800",
632 u"\x00\xff\u07ff\u0800",
633 u"\x00\xff\u07ff\u0800",
634 u"\x00\xff\u07ff\u0800\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200635 u"\x00\xff\u07ff\u0800\uffff",
636 u"\x00\xff\u07ff\u0800\uffff",
637 u"\x00\xff\u07ff\u0800\uffff",
638 u"\x00\xff\u07ff\u0800\uffff\U00010000",
Walter Dörwald69652032004-09-07 20:24:22 +0000639 ]
640 )
641
Walter Dörwalde22d3392005-11-17 08:52:34 +0000642class UTF7Test(ReadTest):
643 encoding = "utf-7"
644
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +0000645 def test_partial(self):
646 self.check_partial(
647 u"a+-b",
648 [
649 u"a",
650 u"a",
651 u"a+",
652 u"a+-",
653 u"a+-b",
654 ]
655 )
Walter Dörwalde22d3392005-11-17 08:52:34 +0000656
Serhiy Storchakaf1056722013-10-19 20:37:49 +0300657 def test_errors(self):
658 tests = [
659 ('a\xffb', u'a\ufffdb'),
660 ('a+IK', u'a\ufffd'),
661 ('a+IK-b', u'a\ufffdb'),
662 ('a+IK,b', u'a\ufffdb'),
663 ('a+IKx', u'a\u20ac\ufffd'),
664 ('a+IKx-b', u'a\u20ac\ufffdb'),
665 ('a+IKwgr', u'a\u20ac\ufffd'),
666 ('a+IKwgr-b', u'a\u20ac\ufffdb'),
667 ('a+IKwgr,', u'a\u20ac\ufffd'),
668 ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
669 ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
670 ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
671 ('a+/,+IKw-b', u'a\ufffd\u20acb'),
672 ('a+//,+IKw-b', u'a\ufffd\u20acb'),
673 ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
674 ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
675 ]
676 for raw, expected in tests:
677 self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
678 raw, 'strict', True)
679 self.assertEqual(raw.decode('utf-7', 'replace'), expected)
680
681 def test_nonbmp(self):
682 self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
683 self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
684 self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
685
Walter Dörwalde22d3392005-11-17 08:52:34 +0000686class UTF16ExTest(unittest.TestCase):
687
688 def test_errors(self):
689 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
690
691 def test_bad_args(self):
692 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
693
694class ReadBufferTest(unittest.TestCase):
695
696 def test_array(self):
697 import array
698 self.assertEqual(
699 codecs.readbuffer_encode(array.array("c", "spam")),
700 ("spam", 4)
701 )
702
703 def test_empty(self):
704 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
705
706 def test_bad_args(self):
707 self.assertRaises(TypeError, codecs.readbuffer_encode)
708 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
709
710class CharBufferTest(unittest.TestCase):
711
712 def test_string(self):
713 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
714
715 def test_empty(self):
716 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
717
718 def test_bad_args(self):
719 self.assertRaises(TypeError, codecs.charbuffer_encode)
720 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
721
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000722class UTF8SigTest(ReadTest):
723 encoding = "utf-8-sig"
724
725 def test_partial(self):
726 self.check_partial(
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200727 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000728 [
729 u"",
730 u"",
731 u"", # First BOM has been read and skipped
732 u"",
733 u"",
734 u"\ufeff", # Second BOM has been read and emitted
735 u"\ufeff\x00", # "\x00" read and emitted
736 u"\ufeff\x00", # First byte of encoded u"\xff" read
737 u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
738 u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
739 u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
740 u"\ufeff\x00\xff\u07ff",
741 u"\ufeff\x00\xff\u07ff",
742 u"\ufeff\x00\xff\u07ff\u0800",
743 u"\ufeff\x00\xff\u07ff\u0800",
744 u"\ufeff\x00\xff\u07ff\u0800",
745 u"\ufeff\x00\xff\u07ff\u0800\uffff",
Serhiy Storchakac4b82c02013-01-08 23:12:00 +0200746 u"\ufeff\x00\xff\u07ff\u0800\uffff",
747 u"\ufeff\x00\xff\u07ff\u0800\uffff",
748 u"\ufeff\x00\xff\u07ff\u0800\uffff",
749 u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
Martin v. Löwis412ed3b2006-01-08 10:45:39 +0000750 ]
751 )
752
Walter Dörwald39b8b6a2006-11-23 05:03:56 +0000753 def test_bug1601501(self):
754 # SF bug #1601501: check that the codec works with a buffer
755 unicode("\xef\xbb\xbf", "utf-8-sig")
756
Walter Dörwald42348272007-04-12 10:35:00 +0000757 def test_bom(self):
758 d = codecs.getincrementaldecoder("utf-8-sig")()
759 s = u"spam"
760 self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
761
Walter Dörwald183744d2007-11-19 12:41:10 +0000762 def test_stream_bom(self):
763 unistring = u"ABC\u00A1\u2200XYZ"
764 bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
765
766 reader = codecs.getreader("utf-8-sig")
767 for sizehint in [None] + range(1, 11) + \
768 [64, 128, 256, 512, 1024]:
769 istream = reader(StringIO.StringIO(bytestring))
770 ostream = StringIO.StringIO()
771 while 1:
772 if sizehint is not None:
773 data = istream.read(sizehint)
774 else:
775 data = istream.read()
776
777 if not data:
778 break
779 ostream.write(data)
780
781 got = ostream.getvalue()
782 self.assertEqual(got, unistring)
783
784 def test_stream_bare(self):
785 unistring = u"ABC\u00A1\u2200XYZ"
786 bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
787
788 reader = codecs.getreader("utf-8-sig")
789 for sizehint in [None] + range(1, 11) + \
790 [64, 128, 256, 512, 1024]:
791 istream = reader(StringIO.StringIO(bytestring))
792 ostream = StringIO.StringIO()
793 while 1:
794 if sizehint is not None:
795 data = istream.read(sizehint)
796 else:
797 data = istream.read()
798
799 if not data:
800 break
801 ostream.write(data)
802
803 got = ostream.getvalue()
804 self.assertEqual(got, unistring)
805
Walter Dörwald8709a422002-09-03 13:53:40 +0000806class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000807 def test_empty(self):
Ezio Melotti2623a372010-11-21 13:34:58 +0000808 self.assertEqual(codecs.escape_decode(""), ("", 0))
Walter Dörwald8709a422002-09-03 13:53:40 +0000809
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200810 def test_raw(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200811 decode = codecs.escape_decode
812 for b in range(256):
813 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200814 if b != '\\':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200815 self.assertEqual(decode(b + '0'), (b + '0', 2))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200816
817 def test_escape(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200818 decode = codecs.escape_decode
819 check = coding_checker(self, decode)
820 check(b"[\\\n]", b"[]")
821 check(br'[\"]', b'["]')
822 check(br"[\']", b"[']")
823 check(br"[\\]", br"[\]")
824 check(br"[\a]", b"[\x07]")
825 check(br"[\b]", b"[\x08]")
826 check(br"[\t]", b"[\x09]")
827 check(br"[\n]", b"[\x0a]")
828 check(br"[\v]", b"[\x0b]")
829 check(br"[\f]", b"[\x0c]")
830 check(br"[\r]", b"[\x0d]")
831 check(br"[\7]", b"[\x07]")
832 check(br"[\8]", br"[\8]")
833 check(br"[\78]", b"[\x078]")
834 check(br"[\41]", b"[!]")
835 check(br"[\418]", b"[!8]")
836 check(br"[\101]", b"[A]")
837 check(br"[\1010]", b"[A0]")
838 check(br"[\501]", b"[A]")
839 check(br"[\x41]", b"[A]")
840 check(br"[\X41]", br"[\X41]")
841 check(br"[\x410]", b"[A0]")
842 for b in range(256):
843 b = chr(b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200844 if b not in '\n"\'\\abtnvfr01234567x':
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200845 check('\\' + b, '\\' + b)
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200846
847 def test_errors(self):
Serhiy Storchaka7277f9d2013-01-29 11:06:28 +0200848 decode = codecs.escape_decode
849 self.assertRaises(ValueError, decode, br"\x")
850 self.assertRaises(ValueError, decode, br"[\x]")
851 self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
852 self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
853 self.assertRaises(ValueError, decode, br"\x0")
854 self.assertRaises(ValueError, decode, br"[\x0]")
855 self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
856 self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200857
Marc-André Lemburg29273c82003-02-04 19:35:03 +0000858class RecodingTest(unittest.TestCase):
859 def test_recoding(self):
860 f = StringIO.StringIO()
861 f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
862 f2.write(u"a")
863 f2.close()
864 # Python used to crash on this at exit because of a refcount
865 # bug in _codecsmodule.c
Fred Drake2e2be372001-09-20 21:33:42 +0000866
Martin v. Löwis2548c732003-04-18 10:39:54 +0000867# From RFC 3492
868punycode_testcases = [
869 # A Arabic (Egyptian):
870 (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
871 u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
872 "egbpdaj6bu4bxfgehfvwxn"),
873 # B Chinese (simplified):
874 (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
875 "ihqwcrb4cv8a8dqg056pqjye"),
876 # C Chinese (traditional):
877 (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
878 "ihqwctvzc91f659drss3x8bo0yb"),
879 # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
880 (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
881 u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
882 u"\u0065\u0073\u006B\u0079",
883 "Proprostnemluvesky-uyb24dma41a"),
884 # E Hebrew:
885 (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
886 u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
887 u"\u05D1\u05E8\u05D9\u05EA",
888 "4dbcagdahymbxekheh6e0a7fei0b"),
889 # F Hindi (Devanagari):
890 (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
891 u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
892 u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
893 u"\u0939\u0948\u0902",
894 "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),
895
896 #(G) Japanese (kanji and hiragana):
897 (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
898 u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
899 "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),
900
901 # (H) Korean (Hangul syllables):
902 (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
903 u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
904 u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
905 "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
906 "psd879ccm6fea98c"),
907
908 # (I) Russian (Cyrillic):
909 (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
910 u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
911 u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
912 u"\u0438",
913 "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),
914
915 # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
916 (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
917 u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
918 u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
919 u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
920 u"\u0061\u00F1\u006F\u006C",
921 "PorqunopuedensimplementehablarenEspaol-fmd56a"),
922
923 # (K) Vietnamese:
924 # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
925 # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
926 (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
927 u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
928 u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
929 u"\u0056\u0069\u1EC7\u0074",
930 "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),
931
Martin v. Löwis2548c732003-04-18 10:39:54 +0000932 #(L) 3<nen>B<gumi><kinpachi><sensei>
933 (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
934 "3B-ww4c5e180e575a65lsy2b"),
Tim Peters0eadaac2003-04-24 16:02:54 +0000935
Martin v. Löwis2548c732003-04-18 10:39:54 +0000936 # (M) <amuro><namie>-with-SUPER-MONKEYS
937 (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
938 u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
939 u"\u004F\u004E\u004B\u0045\u0059\u0053",
940 "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),
941
942 # (N) Hello-Another-Way-<sorezore><no><basho>
943 (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
944 u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
945 u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
946 "Hello-Another-Way--fc4qua05auwb3674vfr0b"),
947
948 # (O) <hitotsu><yane><no><shita>2
949 (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
950 "2-u9tlzr9756bt3uc0v"),
951
952 # (P) Maji<de>Koi<suru>5<byou><mae>
953 (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
954 u"\u308B\u0035\u79D2\u524D",
955 "MajiKoi5-783gue6qz075azm5e"),
956
957 # (Q) <pafii>de<runba>
958 (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
959 "de-jg4avhby1noc0d"),
960
961 # (R) <sono><supiido><de>
962 (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
963 "d9juau41awczczp"),
964
965 # (S) -> $1.00 <-
966 (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
967 u"\u003C\u002D",
968 "-> $1.00 <--")
969 ]
970
971for i in punycode_testcases:
972 if len(i)!=2:
973 print repr(i)
974
975class PunycodeTest(unittest.TestCase):
976 def test_encode(self):
977 for uni, puny in punycode_testcases:
978 # Need to convert both strings to lower case, since
979 # some of the extended encodings use upper case, but our
980 # code produces only lower case. Converting just puny to
981 # lower is also insufficient, since some of the input characters
982 # are upper case.
Ezio Melotti2623a372010-11-21 13:34:58 +0000983 self.assertEqual(uni.encode("punycode").lower(), puny.lower())
Martin v. Löwis2548c732003-04-18 10:39:54 +0000984
985 def test_decode(self):
986 for uni, puny in punycode_testcases:
Ezio Melotti2623a372010-11-21 13:34:58 +0000987 self.assertEqual(uni, puny.decode("punycode"))
Martin v. Löwis2548c732003-04-18 10:39:54 +0000988
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000989class UnicodeInternalTest(unittest.TestCase):
990 def test_bug1251300(self):
991 # Decoding with unicode_internal used to not correctly handle "code
992 # points" above 0x10ffff on UCS-4 builds.
993 if sys.maxunicode > 0xffff:
994 ok = [
995 ("\x00\x10\xff\xff", u"\U0010ffff"),
996 ("\x00\x00\x01\x01", u"\U00000101"),
997 ("", u""),
998 ]
999 not_ok = [
1000 "\x7f\xff\xff\xff",
1001 "\x80\x00\x00\x00",
1002 "\x81\x00\x00\x00",
1003 "\x00",
1004 "\x00\x00\x00\x00\x00",
1005 ]
1006 for internal, uni in ok:
1007 if sys.byteorder == "little":
1008 internal = "".join(reversed(internal))
Ezio Melotti2623a372010-11-21 13:34:58 +00001009 self.assertEqual(uni, internal.decode("unicode_internal"))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001010 for internal in not_ok:
1011 if sys.byteorder == "little":
1012 internal = "".join(reversed(internal))
1013 self.assertRaises(UnicodeDecodeError, internal.decode,
1014 "unicode_internal")
1015
1016 def test_decode_error_attributes(self):
1017 if sys.maxunicode > 0xffff:
1018 try:
1019 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1020 except UnicodeDecodeError, ex:
Ezio Melotti2623a372010-11-21 13:34:58 +00001021 self.assertEqual("unicode_internal", ex.encoding)
1022 self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1023 self.assertEqual(4, ex.start)
1024 self.assertEqual(8, ex.end)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001025 else:
1026 self.fail()
1027
1028 def test_decode_callback(self):
1029 if sys.maxunicode > 0xffff:
1030 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1031 decoder = codecs.getdecoder("unicode_internal")
1032 ab = u"ab".encode("unicode_internal")
1033 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1034 "UnicodeInternalTest")
Ezio Melotti2623a372010-11-21 13:34:58 +00001035 self.assertEqual((u"ab", 12), ignored)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00001036
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001037 def test_encode_length(self):
1038 # Issue 3739
1039 encoder = codecs.getencoder("unicode_internal")
Ezio Melotti2623a372010-11-21 13:34:58 +00001040 self.assertEqual(encoder(u"a")[1], 1)
1041 self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
Walter Dörwalda7fb4082009-05-06 14:28:24 +00001042
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001043 encoder = codecs.getencoder("string-escape")
Ezio Melotti2623a372010-11-21 13:34:58 +00001044 self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001045
Martin v. Löwis2548c732003-04-18 10:39:54 +00001046# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
1047nameprep_tests = [
1048 # 3.1 Map to nothing.
1049 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
1050 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
1051 '\xb8\x8f\xef\xbb\xbf',
1052 'foobarbaz'),
1053 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
1054 ('CAFE',
1055 'cafe'),
1056 # 3.3 Case folding 8bit U+00DF (german sharp s).
1057 # The original test case is bogus; it says \xc3\xdf
1058 ('\xc3\x9f',
1059 'ss'),
1060 # 3.4 Case folding U+0130 (turkish capital I with dot).
1061 ('\xc4\xb0',
1062 'i\xcc\x87'),
1063 # 3.5 Case folding multibyte U+0143 U+037A.
1064 ('\xc5\x83\xcd\xba',
1065 '\xc5\x84 \xce\xb9'),
1066 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
1067 # XXX: skip this as it fails in UCS-2 mode
1068 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
1069 # 'telc\xe2\x88\x95kg\xcf\x83'),
1070 (None, None),
1071 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
1072 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
1073 '\xc7\xb0 a'),
1074 # 3.8 Case folding U+1FB7 and normalization.
1075 ('\xe1\xbe\xb7',
1076 '\xe1\xbe\xb6\xce\xb9'),
1077 # 3.9 Self-reverting case folding U+01F0 and normalization.
1078 # The original test case is bogus, it says `\xc7\xf0'
1079 ('\xc7\xb0',
1080 '\xc7\xb0'),
1081 # 3.10 Self-reverting case folding U+0390 and normalization.
1082 ('\xce\x90',
1083 '\xce\x90'),
1084 # 3.11 Self-reverting case folding U+03B0 and normalization.
1085 ('\xce\xb0',
1086 '\xce\xb0'),
1087 # 3.12 Self-reverting case folding U+1E96 and normalization.
1088 ('\xe1\xba\x96',
1089 '\xe1\xba\x96'),
1090 # 3.13 Self-reverting case folding U+1F56 and normalization.
1091 ('\xe1\xbd\x96',
1092 '\xe1\xbd\x96'),
1093 # 3.14 ASCII space character U+0020.
1094 (' ',
1095 ' '),
1096 # 3.15 Non-ASCII 8bit space character U+00A0.
1097 ('\xc2\xa0',
1098 ' '),
1099 # 3.16 Non-ASCII multibyte space character U+1680.
1100 ('\xe1\x9a\x80',
1101 None),
1102 # 3.17 Non-ASCII multibyte space character U+2000.
1103 ('\xe2\x80\x80',
1104 ' '),
1105 # 3.18 Zero Width Space U+200b.
1106 ('\xe2\x80\x8b',
1107 ''),
1108 # 3.19 Non-ASCII multibyte space character U+3000.
1109 ('\xe3\x80\x80',
1110 ' '),
1111 # 3.20 ASCII control characters U+0010 U+007F.
1112 ('\x10\x7f',
1113 '\x10\x7f'),
1114 # 3.21 Non-ASCII 8bit control character U+0085.
1115 ('\xc2\x85',
1116 None),
1117 # 3.22 Non-ASCII multibyte control character U+180E.
1118 ('\xe1\xa0\x8e',
1119 None),
1120 # 3.23 Zero Width No-Break Space U+FEFF.
1121 ('\xef\xbb\xbf',
1122 ''),
1123 # 3.24 Non-ASCII control character U+1D175.
1124 ('\xf0\x9d\x85\xb5',
1125 None),
1126 # 3.25 Plane 0 private use character U+F123.
1127 ('\xef\x84\xa3',
1128 None),
1129 # 3.26 Plane 15 private use character U+F1234.
1130 ('\xf3\xb1\x88\xb4',
1131 None),
1132 # 3.27 Plane 16 private use character U+10F234.
1133 ('\xf4\x8f\x88\xb4',
1134 None),
1135 # 3.28 Non-character code point U+8FFFE.
1136 ('\xf2\x8f\xbf\xbe',
1137 None),
1138 # 3.29 Non-character code point U+10FFFF.
1139 ('\xf4\x8f\xbf\xbf',
1140 None),
1141 # 3.30 Surrogate code U+DF42.
1142 ('\xed\xbd\x82',
1143 None),
1144 # 3.31 Non-plain text character U+FFFD.
1145 ('\xef\xbf\xbd',
1146 None),
1147 # 3.32 Ideographic description character U+2FF5.
1148 ('\xe2\xbf\xb5',
1149 None),
1150 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +00001151 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +00001152 '\xcc\x81'),
1153 # 3.34 Left-to-right mark U+200E.
1154 ('\xe2\x80\x8e',
1155 None),
1156 # 3.35 Deprecated U+202A.
1157 ('\xe2\x80\xaa',
1158 None),
1159 # 3.36 Language tagging character U+E0001.
1160 ('\xf3\xa0\x80\x81',
1161 None),
1162 # 3.37 Language tagging character U+E0042.
1163 ('\xf3\xa0\x81\x82',
1164 None),
1165 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
1166 ('foo\xd6\xbebar',
1167 None),
1168 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
1169 ('foo\xef\xb5\x90bar',
1170 None),
1171 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
1172 ('foo\xef\xb9\xb6bar',
1173 'foo \xd9\x8ebar'),
1174 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
1175 ('\xd8\xa71',
1176 None),
1177 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
1178 ('\xd8\xa71\xd8\xa8',
1179 '\xd8\xa71\xd8\xa8'),
1180 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +00001181 # Skip this test as we allow unassigned
1182 #('\xf3\xa0\x80\x82',
1183 # None),
1184 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +00001185 # 3.44 Larger test (shrinking).
1186 # Original test case reads \xc3\xdf
1187 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
1188 '\xaa\xce\xb0\xe2\x80\x80',
1189 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
1190 # 3.45 Larger test (expanding).
1191 # Original test case reads \xc3\x9f
1192 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
1193 '\x80',
1194 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
1195 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
1196 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
1197 ]
1198
1199
1200class NameprepTest(unittest.TestCase):
1201 def test_nameprep(self):
1202 from encodings.idna import nameprep
1203 for pos, (orig, prepped) in enumerate(nameprep_tests):
1204 if orig is None:
1205 # Skipped
1206 continue
1207 # The Unicode strings are given in UTF-8
1208 orig = unicode(orig, "utf-8")
1209 if prepped is None:
1210 # Input contains prohibited characters
1211 self.assertRaises(UnicodeError, nameprep, orig)
1212 else:
1213 prepped = unicode(prepped, "utf-8")
1214 try:
Ezio Melotti2623a372010-11-21 13:34:58 +00001215 self.assertEqual(nameprep(orig), prepped)
Martin v. Löwis2548c732003-04-18 10:39:54 +00001216 except Exception,e:
1217 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1218
Walter Dörwald78a0be62006-04-14 18:25:39 +00001219class IDNACodecTest(unittest.TestCase):
1220 def test_builtin_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001221 self.assertEqual(unicode("python.org", "idna"), u"python.org")
1222 self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1223 self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1224 self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001225
1226 def test_builtin_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001227 self.assertEqual(u"python.org".encode("idna"), "python.org")
1228 self.assertEqual("python.org.".encode("idna"), "python.org.")
1229 self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1230 self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001231
Martin v. Löwis8b595142005-08-25 11:03:38 +00001232 def test_stream(self):
1233 import StringIO
1234 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1235 r.read(3)
Ezio Melotti2623a372010-11-21 13:34:58 +00001236 self.assertEqual(r.read(), u"")
Martin v. Löwis8b595142005-08-25 11:03:38 +00001237
Walter Dörwald78a0be62006-04-14 18:25:39 +00001238 def test_incremental_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001239 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001240 "".join(codecs.iterdecode("python.org", "idna")),
1241 u"python.org"
1242 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001243 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001244 "".join(codecs.iterdecode("python.org.", "idna")),
1245 u"python.org."
1246 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001247 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001248 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1249 u"pyth\xf6n.org."
1250 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001251 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001252 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1253 u"pyth\xf6n.org."
1254 )
1255
1256 decoder = codecs.getincrementaldecoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001257 self.assertEqual(decoder.decode("xn--xam", ), u"")
1258 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1259 self.assertEqual(decoder.decode(u"rg"), u"")
1260 self.assertEqual(decoder.decode(u"", True), u"org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001261
1262 decoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001263 self.assertEqual(decoder.decode("xn--xam", ), u"")
1264 self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1265 self.assertEqual(decoder.decode("rg."), u"org.")
1266 self.assertEqual(decoder.decode("", True), u"")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001267
1268 def test_incremental_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001269 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001270 "".join(codecs.iterencode(u"python.org", "idna")),
1271 "python.org"
1272 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001273 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001274 "".join(codecs.iterencode(u"python.org.", "idna")),
1275 "python.org."
1276 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001277 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001278 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1279 "xn--pythn-mua.org."
1280 )
Ezio Melotti2623a372010-11-21 13:34:58 +00001281 self.assertEqual(
Walter Dörwald78a0be62006-04-14 18:25:39 +00001282 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1283 "xn--pythn-mua.org."
1284 )
1285
1286 encoder = codecs.getincrementalencoder("idna")()
Ezio Melotti2623a372010-11-21 13:34:58 +00001287 self.assertEqual(encoder.encode(u"\xe4x"), "")
1288 self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1289 self.assertEqual(encoder.encode(u"", True), "org")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001290
1291 encoder.reset()
Ezio Melotti2623a372010-11-21 13:34:58 +00001292 self.assertEqual(encoder.encode(u"\xe4x"), "")
1293 self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1294 self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001295
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001296class CodecsModuleTest(unittest.TestCase):
1297
1298 def test_decode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001299 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001300 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001301 self.assertRaises(TypeError, codecs.decode)
Ezio Melotti2623a372010-11-21 13:34:58 +00001302 self.assertEqual(codecs.decode('abc'), u'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001303 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1304
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001305 def test_encode(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001306 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001307 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001308 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001309 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Ezio Melotti2623a372010-11-21 13:34:58 +00001310 self.assertEqual(codecs.encode(u'abc'), 'abc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001311 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1312
1313 def test_register(self):
1314 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001315 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001316
1317 def test_lookup(self):
1318 self.assertRaises(TypeError, codecs.lookup)
1319 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001320 self.assertRaises(LookupError, codecs.lookup, " ")
1321
1322 def test_getencoder(self):
1323 self.assertRaises(TypeError, codecs.getencoder)
1324 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1325
1326 def test_getdecoder(self):
1327 self.assertRaises(TypeError, codecs.getdecoder)
1328 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1329
1330 def test_getreader(self):
1331 self.assertRaises(TypeError, codecs.getreader)
1332 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1333
1334 def test_getwriter(self):
1335 self.assertRaises(TypeError, codecs.getwriter)
1336 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001337
Antoine Pitrou4cfae022011-07-24 02:51:01 +02001338 def test_lookup_issue1813(self):
1339 # Issue #1813: under Turkish locales, lookup of some codecs failed
1340 # because 'I' is lowercased as a dotless "i"
1341 oldlocale = locale.getlocale(locale.LC_CTYPE)
1342 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1343 try:
1344 locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1345 except locale.Error:
1346 # Unsupported locale on this system
1347 self.skipTest('test needs Turkish locale')
1348 c = codecs.lookup('ASCII')
1349 self.assertEqual(c.name, 'ascii')
1350
Serhiy Storchaka74a651b2014-12-20 17:42:24 +02001351 def test_all(self):
1352 api = (
1353 "encode", "decode",
1354 "register", "CodecInfo", "Codec", "IncrementalEncoder",
1355 "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1356 "getencoder", "getdecoder", "getincrementalencoder",
1357 "getincrementaldecoder", "getreader", "getwriter",
1358 "register_error", "lookup_error",
1359 "strict_errors", "replace_errors", "ignore_errors",
1360 "xmlcharrefreplace_errors", "backslashreplace_errors",
1361 "open", "EncodedFile",
1362 "iterencode", "iterdecode",
1363 "BOM", "BOM_BE", "BOM_LE",
1364 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1365 "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1366 "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", # Undocumented
1367 "StreamReaderWriter", "StreamRecoder",
1368 )
1369 self.assertEqual(sorted(api), sorted(codecs.__all__))
1370 for api in codecs.__all__:
1371 getattr(codecs, api)
1372
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001373class StreamReaderTest(unittest.TestCase):
1374
1375 def setUp(self):
1376 self.reader = codecs.getreader('utf-8')
1377 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1378
1379 def test_readlines(self):
1380 f = self.reader(self.stream)
Ezio Melotti2623a372010-11-21 13:34:58 +00001381 self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001382
Georg Brandl8f99f812006-10-29 08:39:22 +00001383class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001384
Georg Brandl8f99f812006-10-29 08:39:22 +00001385 def test_basic(self):
1386 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001387 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
Ezio Melotti2623a372010-11-21 13:34:58 +00001388 self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001389
1390 f = StringIO.StringIO()
1391 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1392 ef.write('\xc3\xbc')
Ezio Melotti2623a372010-11-21 13:34:58 +00001393 self.assertEqual(f.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001394
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001395class Str2StrTest(unittest.TestCase):
1396
1397 def test_read(self):
1398 sin = "\x80".encode("base64_codec")
1399 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1400 sout = reader.read()
1401 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001402 self.assertIsInstance(sout, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001403
1404 def test_readline(self):
1405 sin = "\x80".encode("base64_codec")
1406 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1407 sout = reader.readline()
1408 self.assertEqual(sout, "\x80")
Ezio Melottib0f5adc2010-01-24 16:58:36 +00001409 self.assertIsInstance(sout, str)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001410
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001411all_unicode_encodings = [
1412 "ascii",
1413 "base64_codec",
1414 "big5",
1415 "big5hkscs",
1416 "charmap",
1417 "cp037",
1418 "cp1006",
1419 "cp1026",
1420 "cp1140",
1421 "cp1250",
1422 "cp1251",
1423 "cp1252",
1424 "cp1253",
1425 "cp1254",
1426 "cp1255",
1427 "cp1256",
1428 "cp1257",
1429 "cp1258",
1430 "cp424",
1431 "cp437",
1432 "cp500",
Georg Brandlf0757a22010-05-24 21:29:07 +00001433 "cp720",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001434 "cp737",
1435 "cp775",
1436 "cp850",
1437 "cp852",
1438 "cp855",
1439 "cp856",
1440 "cp857",
Georg Brandlf0757a22010-05-24 21:29:07 +00001441 "cp858",
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001442 "cp860",
1443 "cp861",
1444 "cp862",
1445 "cp863",
1446 "cp864",
1447 "cp865",
1448 "cp866",
1449 "cp869",
1450 "cp874",
1451 "cp875",
1452 "cp932",
1453 "cp949",
1454 "cp950",
1455 "euc_jis_2004",
1456 "euc_jisx0213",
1457 "euc_jp",
1458 "euc_kr",
1459 "gb18030",
1460 "gb2312",
1461 "gbk",
1462 "hex_codec",
1463 "hp_roman8",
1464 "hz",
1465 "idna",
1466 "iso2022_jp",
1467 "iso2022_jp_1",
1468 "iso2022_jp_2",
1469 "iso2022_jp_2004",
1470 "iso2022_jp_3",
1471 "iso2022_jp_ext",
1472 "iso2022_kr",
1473 "iso8859_1",
1474 "iso8859_10",
1475 "iso8859_11",
1476 "iso8859_13",
1477 "iso8859_14",
1478 "iso8859_15",
1479 "iso8859_16",
1480 "iso8859_2",
1481 "iso8859_3",
1482 "iso8859_4",
1483 "iso8859_5",
1484 "iso8859_6",
1485 "iso8859_7",
1486 "iso8859_8",
1487 "iso8859_9",
1488 "johab",
1489 "koi8_r",
1490 "koi8_u",
1491 "latin_1",
1492 "mac_cyrillic",
1493 "mac_greek",
1494 "mac_iceland",
1495 "mac_latin2",
1496 "mac_roman",
1497 "mac_turkish",
1498 "palmos",
1499 "ptcp154",
1500 "punycode",
1501 "raw_unicode_escape",
1502 "rot_13",
1503 "shift_jis",
1504 "shift_jis_2004",
1505 "shift_jisx0213",
1506 "tis_620",
1507 "unicode_escape",
1508 "unicode_internal",
1509 "utf_16",
1510 "utf_16_be",
1511 "utf_16_le",
1512 "utf_7",
1513 "utf_8",
1514]
1515
1516if hasattr(codecs, "mbcs_encode"):
1517 all_unicode_encodings.append("mbcs")
1518
1519# The following encodings work only with str, not unicode
1520all_string_encodings = [
1521 "quopri_codec",
1522 "string_escape",
1523 "uu_codec",
1524]
1525
1526# The following encoding is not tested, because it's not supposed
1527# to work:
1528# "undefined"
1529
1530# The following encodings don't work in stateful mode
1531broken_unicode_with_streams = [
1532 "base64_codec",
1533 "hex_codec",
1534 "punycode",
1535 "unicode_internal"
1536]
Georg Brandl2c9838e2006-10-29 14:39:09 +00001537broken_incremental_coders = broken_unicode_with_streams[:]
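# These are transform codecs (base64, hex) or codecs that need to see the
# whole input at once (punycode, unicode_internal), so feeding them data
# piecewise through stream or incremental coders is not expected to
# round-trip; they are therefore skipped in the stateful checks below.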
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001538
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001539# The following encodings only support "strict" mode
1540only_strict_mode = [
1541 "idna",
1542 "zlib_codec",
Neal Norwitz1ead6982006-10-29 23:58:36 +00001543 "bz2_codec",
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001544]
1545
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001546try:
1547 import bz2
1548except ImportError:
1549 pass
1550else:
1551 all_unicode_encodings.append("bz2_codec")
1552 broken_unicode_with_streams.append("bz2_codec")
1553
1554try:
1555 import zlib
1556except ImportError:
1557 pass
1558else:
1559 all_unicode_encodings.append("zlib_codec")
1560 broken_unicode_with_streams.append("zlib_codec")
1561
1562class BasicUnicodeTest(unittest.TestCase):
1563 def test_basics(self):
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001564 s = u"abc123" # all codecs should be able to encode these
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001565 for encoding in all_unicode_encodings:
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001566 name = codecs.lookup(encoding).name
1567 if encoding.endswith("_codec"):
1568 name += "_codec"
1569 elif encoding == "latin_1":
1570 name = "latin_1"
1571 self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001572 (bytes, size) = codecs.getencoder(encoding)(s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001573 self.assertEqual(size, len(s), "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001574 (chars, size) = codecs.getdecoder(encoding)(bytes)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001575 self.assertEqual(chars, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001576
1577 if encoding not in broken_unicode_with_streams:
1578 # check stream reader/writer
1579 q = Queue()
1580 writer = codecs.getwriter(encoding)(q)
1581 encodedresult = ""
1582 for c in s:
1583 writer.write(c)
1584 encodedresult += q.read()
1585 q = Queue()
1586 reader = codecs.getreader(encoding)(q)
1587 decodedresult = u""
1588 for c in encodedresult:
1589 q.write(c)
1590 decodedresult += reader.read()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001591 self.assertEqual(decodedresult, s, "encoding=%r" % encoding)
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001592
Georg Brandl2c9838e2006-10-29 14:39:09 +00001593 if encoding not in broken_incremental_coders:
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001594 # check incremental decoder/encoder and iterencode()/iterdecode()
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001595 try:
1596 encoder = codecs.getincrementalencoder(encoding)()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001597 except LookupError: # no IncrementalEncoder
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001598 pass
1599 else:
1600 # check incremental decoder/encoder
1601 encodedresult = ""
1602 for c in s:
1603 encodedresult += encoder.encode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001604 encodedresult += encoder.encode(u"", True)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001605 decoder = codecs.getincrementaldecoder(encoding)()
1606 decodedresult = u""
1607 for c in encodedresult:
1608 decodedresult += decoder.decode(c)
Walter Dörwald15be5ec2006-04-14 14:03:55 +00001609 decodedresult += decoder.decode("", True)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001610 self.assertEqual(decodedresult, s,
1611 "encoding=%r" % encoding)
Walter Dörwald9ae019b2006-03-18 14:22:26 +00001612
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001613 # check iterencode()/iterdecode()
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001614 result = u"".join(codecs.iterdecode(
1615 codecs.iterencode(s, encoding), encoding))
1616 self.assertEqual(result, s, "encoding=%r" % encoding)
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001617
1618 # check iterencode()/iterdecode() with empty string
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001619 result = u"".join(codecs.iterdecode(
1620 codecs.iterencode(u"", encoding), encoding))
Walter Dörwaldabb02e52006-03-15 11:35:15 +00001621 self.assertEqual(result, u"")
1622
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001623 if encoding not in only_strict_mode:
1624 # check incremental decoder/encoder with errors argument
1625 try:
1626 encoder = codecs.getincrementalencoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001627 except LookupError: # no IncrementalEncoder
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001628 pass
1629 else:
1630 encodedresult = "".join(encoder.encode(c) for c in s)
1631 decoder = codecs.getincrementaldecoder(encoding)("ignore")
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001632 decodedresult = u"".join(decoder.decode(c)
1633 for c in encodedresult)
1634 self.assertEqual(decodedresult, s,
1635 "encoding=%r" % encoding)
Tim Petersabd8a332006-11-03 02:32:46 +00001636
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001637 @test_support.cpython_only
1638 def test_basics_capi(self):
1639 from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
1640 s = u"abc123" # all codecs should be able to encode these
1641 for encoding in all_unicode_encodings:
1642 if encoding not in broken_incremental_coders:
1643 # check incremental decoder/encoder and iterencode()/iterdecode()
1644 try:
1645 cencoder = codec_incrementalencoder(encoding)
1646 except LookupError: # no IncrementalEncoder
1647 pass
1648 else:
1649 # check C API
1650 encodedresult = ""
1651 for c in s:
1652 encodedresult += cencoder.encode(c)
1653 encodedresult += cencoder.encode(u"", True)
1654 cdecoder = codec_incrementaldecoder(encoding)
1655 decodedresult = u""
1656 for c in encodedresult:
1657 decodedresult += cdecoder.decode(c)
1658 decodedresult += cdecoder.decode("", True)
1659 self.assertEqual(decodedresult, s,
1660 "encoding=%r" % encoding)
1661
1662 if encoding not in only_strict_mode:
1663 # check incremental decoder/encoder with errors argument
1664 try:
1665 cencoder = codec_incrementalencoder(encoding, "ignore")
1666 except LookupError: # no IncrementalEncoder
1667 pass
1668 else:
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001669 encodedresult = "".join(cencoder.encode(c) for c in s)
Serhiy Storchaka76249ea2014-02-07 10:06:05 +02001670 cdecoder = codec_incrementaldecoder(encoding, "ignore")
1671 decodedresult = u"".join(cdecoder.decode(c)
1672 for c in encodedresult)
1673 self.assertEqual(decodedresult, s,
1674 "encoding=%r" % encoding)
Walter Dörwald98c70ac2006-10-29 23:02:27 +00001675
Walter Dörwald729c31f2005-03-14 19:06:30 +00001676 def test_seek(self):
1677 # all codecs should be able to encode these
1678 s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
1679 for encoding in all_unicode_encodings:
1680 if encoding == "idna": # FIXME: See SF bug #1163178
1681 continue
1682 if encoding in broken_unicode_with_streams:
1683 continue
1684 reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
1685 for t in xrange(5):
1686 # Test that calling seek resets the internal codec state and buffers
1687 reader.seek(0, 0)
1688 line = reader.readline()
1689 self.assertEqual(s[:len(line)], line)
1690
Walter Dörwalde22d3392005-11-17 08:52:34 +00001691 def test_bad_decode_args(self):
1692 for encoding in all_unicode_encodings:
1693 decoder = codecs.getdecoder(encoding)
1694 self.assertRaises(TypeError, decoder)
1695 if encoding not in ("idna", "punycode"):
1696 self.assertRaises(TypeError, decoder, 42)
1697
1698 def test_bad_encode_args(self):
1699 for encoding in all_unicode_encodings:
1700 encoder = codecs.getencoder(encoding)
1701 self.assertRaises(TypeError, encoder)
1702
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001703 def test_encoding_map_type_initialized(self):
1704 from encodings import cp1140
1705        # This used to crash; we only verify here that there is no crash.
1706 table_type = type(cp1140.encoding_table)
1707 self.assertEqual(table_type, table_type)
1708
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001709class BasicStrTest(unittest.TestCase):
1710 def test_basics(self):
1711 s = "abc123"
1712 for encoding in all_string_encodings:
1713 (bytes, size) = codecs.getencoder(encoding)(s)
1714 self.assertEqual(size, len(s))
1715 (chars, size) = codecs.getdecoder(encoding)(bytes)
1716 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1717
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001718class CharmapTest(unittest.TestCase):
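    # charmap_decode() accepts several kinds of decoding map: a unicode
    # string indexed by byte value, a dict mapping byte values to unicode
    # strings, or a dict mapping byte values to code points.  A missing
    # entry, None, or u"\ufffe" marks the byte as undefined.  A rough
    # sketch, using values that are also asserted below:
    #
    #   >>> codecs.charmap_decode("\x00\x01\x02", "strict", u"abc")
    #   (u'abc', 3)
    #   >>> codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
    #   (u'ab\ufffd', 3)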
1719 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001720 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001721 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1722 (u"abc", 3)
1723 )
1724
Serhiy Storchaka95997452013-01-15 14:42:59 +02001725 self.assertRaises(UnicodeDecodeError,
1726 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1727 )
1728
1729 self.assertRaises(UnicodeDecodeError,
1730 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1731 )
1732
Ezio Melotti2623a372010-11-21 13:34:58 +00001733 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001734 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1735 (u"ab\ufffd", 3)
1736 )
1737
Ezio Melotti2623a372010-11-21 13:34:58 +00001738 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001739 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1740 (u"ab\ufffd", 3)
1741 )
1742
Ezio Melotti2623a372010-11-21 13:34:58 +00001743 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001744 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1745 (u"ab", 3)
1746 )
1747
Ezio Melotti2623a372010-11-21 13:34:58 +00001748 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001749 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1750 (u"ab", 3)
1751 )
1752
1753 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001754 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001755 codecs.charmap_decode(allbytes, "ignore", u""),
1756 (u"", len(allbytes))
1757 )
1758
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001759 def test_decode_with_int2str_map(self):
1760 self.assertEqual(
1761 codecs.charmap_decode("\x00\x01\x02", "strict",
1762 {0: u'a', 1: u'b', 2: u'c'}),
1763 (u"abc", 3)
1764 )
1765
1766 self.assertEqual(
1767 codecs.charmap_decode("\x00\x01\x02", "strict",
1768 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1769 (u"AaBbCc", 3)
1770 )
1771
1772 self.assertEqual(
1773 codecs.charmap_decode("\x00\x01\x02", "strict",
1774 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1775 (u"\U0010FFFFbc", 3)
1776 )
1777
1778 self.assertEqual(
1779 codecs.charmap_decode("\x00\x01\x02", "strict",
1780 {0: u'a', 1: u'b', 2: u''}),
1781 (u"ab", 3)
1782 )
1783
1784 self.assertRaises(UnicodeDecodeError,
1785 codecs.charmap_decode, "\x00\x01\x02", "strict",
1786 {0: u'a', 1: u'b'}
1787 )
1788
Serhiy Storchaka95997452013-01-15 14:42:59 +02001789 self.assertRaises(UnicodeDecodeError,
1790 codecs.charmap_decode, "\x00\x01\x02", "strict",
1791 {0: u'a', 1: u'b', 2: None}
1792 )
1793
1794 # Issue #14850
1795 self.assertRaises(UnicodeDecodeError,
1796 codecs.charmap_decode, "\x00\x01\x02", "strict",
1797 {0: u'a', 1: u'b', 2: u'\ufffe'}
1798 )
1799
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001800 self.assertEqual(
1801 codecs.charmap_decode("\x00\x01\x02", "replace",
1802 {0: u'a', 1: u'b'}),
1803 (u"ab\ufffd", 3)
1804 )
1805
1806 self.assertEqual(
1807 codecs.charmap_decode("\x00\x01\x02", "replace",
1808 {0: u'a', 1: u'b', 2: None}),
1809 (u"ab\ufffd", 3)
1810 )
1811
Serhiy Storchaka95997452013-01-15 14:42:59 +02001812 # Issue #14850
1813 self.assertEqual(
1814 codecs.charmap_decode("\x00\x01\x02", "replace",
1815 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1816 (u"ab\ufffd", 3)
1817 )
1818
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001819 self.assertEqual(
1820 codecs.charmap_decode("\x00\x01\x02", "ignore",
1821 {0: u'a', 1: u'b'}),
1822 (u"ab", 3)
1823 )
1824
1825 self.assertEqual(
1826 codecs.charmap_decode("\x00\x01\x02", "ignore",
1827 {0: u'a', 1: u'b', 2: None}),
1828 (u"ab", 3)
1829 )
1830
Serhiy Storchaka95997452013-01-15 14:42:59 +02001831 # Issue #14850
1832 self.assertEqual(
1833 codecs.charmap_decode("\x00\x01\x02", "ignore",
1834 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1835 (u"ab", 3)
1836 )
1837
1838 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001839 self.assertEqual(
1840 codecs.charmap_decode(allbytes, "ignore", {}),
1841 (u"", len(allbytes))
1842 )
1843
1844 def test_decode_with_int2int_map(self):
1845 a = ord(u'a')
1846 b = ord(u'b')
1847 c = ord(u'c')
1848
1849 self.assertEqual(
1850 codecs.charmap_decode("\x00\x01\x02", "strict",
1851 {0: a, 1: b, 2: c}),
1852 (u"abc", 3)
1853 )
1854
1855 # Issue #15379
1856 self.assertEqual(
1857 codecs.charmap_decode("\x00\x01\x02", "strict",
1858 {0: 0x10FFFF, 1: b, 2: c}),
1859 (u"\U0010FFFFbc", 3)
1860 )
1861
1862 self.assertRaises(TypeError,
1863 codecs.charmap_decode, "\x00\x01\x02", "strict",
1864 {0: 0x110000, 1: b, 2: c}
1865 )
1866
1867 self.assertRaises(UnicodeDecodeError,
1868 codecs.charmap_decode, "\x00\x01\x02", "strict",
1869 {0: a, 1: b},
1870 )
1871
Serhiy Storchaka95997452013-01-15 14:42:59 +02001872 self.assertRaises(UnicodeDecodeError,
1873 codecs.charmap_decode, "\x00\x01\x02", "strict",
1874 {0: a, 1: b, 2: 0xFFFE},
1875 )
1876
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001877 self.assertEqual(
1878 codecs.charmap_decode("\x00\x01\x02", "replace",
1879 {0: a, 1: b}),
1880 (u"ab\ufffd", 3)
1881 )
1882
1883 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001884 codecs.charmap_decode("\x00\x01\x02", "replace",
1885 {0: a, 1: b, 2: 0xFFFE}),
1886 (u"ab\ufffd", 3)
1887 )
1888
1889 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001890 codecs.charmap_decode("\x00\x01\x02", "ignore",
1891 {0: a, 1: b}),
1892 (u"ab", 3)
1893 )
1894
Serhiy Storchaka95997452013-01-15 14:42:59 +02001895 self.assertEqual(
1896 codecs.charmap_decode("\x00\x01\x02", "ignore",
1897 {0: a, 1: b, 2: 0xFFFE}),
1898 (u"ab", 3)
1899 )
1900
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001901
Georg Brandl8f99f812006-10-29 08:39:22 +00001902class WithStmtTest(unittest.TestCase):
1903 def test_encodedfile(self):
1904 f = StringIO.StringIO("\xc3\xbc")
1905 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001906 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001907
1908 def test_streamreaderwriter(self):
1909 f = StringIO.StringIO("\xc3\xbc")
1910 info = codecs.lookup("utf-8")
1911 with codecs.StreamReaderWriter(f, info.streamreader,
1912 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001913 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001914
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001915
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001916class UnicodeEscapeTest(unittest.TestCase):
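    # unicode_escape is roughly the escape notation used in Python unicode
    # literals: printable ASCII maps to itself and everything else becomes a
    # backslash escape.  A small sketch, mirroring values asserted below:
    #
    #   >>> codecs.unicode_escape_encode(u"\t\u20ac")
    #   ('\\t\\u20ac', 2)
    #   >>> codecs.unicode_escape_decode(r"\u20ac")
    #   (u'\u20ac', 6)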
1917 def test_empty(self):
1918 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1919 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1920
1921 def test_raw_encode(self):
1922 encode = codecs.unicode_escape_encode
1923 for b in range(32, 127):
1924 if b != ord('\\'):
1925 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1926
1927 def test_raw_decode(self):
1928 decode = codecs.unicode_escape_decode
1929 for b in range(256):
1930 if b != ord('\\'):
1931 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1932
1933 def test_escape_encode(self):
1934 encode = codecs.unicode_escape_encode
1935 check = coding_checker(self, encode)
1936 check(u'\t', r'\t')
1937 check(u'\n', r'\n')
1938 check(u'\r', r'\r')
1939 check(u'\\', r'\\')
1940 for b in range(32):
1941 if chr(b) not in '\t\n\r':
1942 check(unichr(b), '\\x%02x' % b)
1943 for b in range(127, 256):
1944 check(unichr(b), '\\x%02x' % b)
1945 check(u'\u20ac', r'\u20ac')
1946 check(u'\U0001d120', r'\U0001d120')
1947
1948 def test_escape_decode(self):
1949 decode = codecs.unicode_escape_decode
1950 check = coding_checker(self, decode)
1951 check("[\\\n]", u"[]")
1952 check(r'[\"]', u'["]')
1953 check(r"[\']", u"[']")
1954 check(r"[\\]", ur"[\]")
1955 check(r"[\a]", u"[\x07]")
1956 check(r"[\b]", u"[\x08]")
1957 check(r"[\t]", u"[\x09]")
1958 check(r"[\n]", u"[\x0a]")
1959 check(r"[\v]", u"[\x0b]")
1960 check(r"[\f]", u"[\x0c]")
1961 check(r"[\r]", u"[\x0d]")
1962 check(r"[\7]", u"[\x07]")
1963 check(r"[\8]", ur"[\8]")
1964 check(r"[\78]", u"[\x078]")
1965 check(r"[\41]", u"[!]")
1966 check(r"[\418]", u"[!8]")
1967 check(r"[\101]", u"[A]")
1968 check(r"[\1010]", u"[A0]")
1969 check(r"[\x41]", u"[A]")
1970 check(r"[\x410]", u"[A0]")
1971 check(r"\u20ac", u"\u20ac")
1972 check(r"\U0001d120", u"\U0001d120")
1973 for b in range(256):
1974 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
1975 check('\\' + chr(b), u'\\' + unichr(b))
1976
1977 def test_decode_errors(self):
1978 decode = codecs.unicode_escape_decode
1979 for c, d in ('x', 2), ('u', 4), ('U', 4):
1980 for i in range(d):
1981 self.assertRaises(UnicodeDecodeError, decode,
1982 "\\" + c + "0"*i)
1983 self.assertRaises(UnicodeDecodeError, decode,
1984 "[\\" + c + "0"*i + "]")
1985 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1986 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1987 self.assertEqual(decode(data, "replace"),
1988 (u"[\ufffd]\ufffd", len(data)))
1989 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1990 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1991 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1992
1993
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02001994class RawUnicodeEscapeTest(unittest.TestCase):
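    # raw_unicode_escape only processes \u and \U escapes; any other byte,
    # including other backslash sequences, passes through unchanged (as the
    # loops below verify).  For example, roughly:
    #
    #   >>> codecs.raw_unicode_escape_decode(r"\x41\u20ac")
    #   (u'\\x41\u20ac', 10)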
1995 def test_empty(self):
1996 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
1997 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
1998
1999 def test_raw_encode(self):
2000 encode = codecs.raw_unicode_escape_encode
2001 for b in range(256):
2002 self.assertEqual(encode(unichr(b)), (chr(b), 1))
2003
2004 def test_raw_decode(self):
2005 decode = codecs.raw_unicode_escape_decode
2006 for b in range(256):
2007 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2008
2009 def test_escape_encode(self):
2010 encode = codecs.raw_unicode_escape_encode
2011 check = coding_checker(self, encode)
2012 for b in range(256):
2013 if chr(b) not in 'uU':
2014 check(u'\\' + unichr(b), '\\' + chr(b))
2015 check(u'\u20ac', r'\u20ac')
2016 check(u'\U0001d120', r'\U0001d120')
2017
2018 def test_escape_decode(self):
2019 decode = codecs.raw_unicode_escape_decode
2020 check = coding_checker(self, decode)
2021 for b in range(256):
2022 if chr(b) not in 'uU':
2023 check('\\' + chr(b), u'\\' + unichr(b))
2024 check(r"\u20ac", u"\u20ac")
2025 check(r"\U0001d120", u"\U0001d120")
2026
2027 def test_decode_errors(self):
2028 decode = codecs.raw_unicode_escape_decode
2029 for c, d in ('u', 4), ('U', 4):
2030 for i in range(d):
2031 self.assertRaises(UnicodeDecodeError, decode,
2032 "\\" + c + "0"*i)
2033 self.assertRaises(UnicodeDecodeError, decode,
2034 "[\\" + c + "0"*i + "]")
2035 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2036 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2037 self.assertEqual(decode(data, "replace"),
2038 (u"[\ufffd]\ufffd", len(data)))
2039 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2040 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2041 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2042
2043
Victor Stinner262be5e2010-05-22 02:11:07 +00002044class BomTest(unittest.TestCase):
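    # codecs.open() returns a StreamReaderWriter; for the BOM-prefixed
    # encodings its writer should emit the BOM exactly once, at the very
    # start of the file, regardless of any intervening seeks.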
2045 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00002046 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00002047 tests = ("utf-16",
2048 "utf-16-le",
2049 "utf-16-be",
2050 "utf-32",
2051 "utf-32-le",
2052 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02002053 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00002054 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00002055 # Check if the BOM is written only once
2056 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00002057 f.write(data)
2058 f.write(data)
2059 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002060 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002061 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002062 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002063
Victor Stinner7df55da2010-05-22 13:37:56 +00002064 # Check that the BOM is written after a seek(0)
2065 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2066 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002067 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002068 f.seek(0)
2069 f.write(data)
2070 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002071 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002072
2073 # (StreamWriter) Check that the BOM is written after a seek(0)
2074 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2075 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002076 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002077 f.writer.seek(0)
2078 f.writer.write(data)
2079 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002080 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002081
2082 # Check that the BOM is not written after a seek() at a position
2083 # different than the start
2084 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2085 f.write(data)
2086 f.seek(f.tell())
2087 f.write(data)
2088 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002089 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002090
2091 # (StreamWriter) Check that the BOM is not written after a seek()
2092 # at a position different than the start
2093 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2094 f.writer.write(data)
2095 f.writer.seek(f.writer.tell())
2096 f.writer.write(data)
2097 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002098 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002099
Victor Stinner262be5e2010-05-22 02:11:07 +00002100
Fred Drake2e2be372001-09-20 21:33:42 +00002101def test_main():
Walter Dörwald21d3a322003-05-01 17:45:56 +00002102 test_support.run_unittest(
Walter Dörwald6e390802007-08-17 16:41:28 +00002103 UTF32Test,
2104 UTF32LETest,
2105 UTF32BETest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002106 UTF16Test,
Walter Dörwald69652032004-09-07 20:24:22 +00002107 UTF16LETest,
2108 UTF16BETest,
2109 UTF8Test,
Martin v. Löwis412ed3b2006-01-08 10:45:39 +00002110 UTF8SigTest,
Walter Dörwalde22d3392005-11-17 08:52:34 +00002111 UTF7Test,
2112 UTF16ExTest,
2113 ReadBufferTest,
2114 CharBufferTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002115 EscapeDecodeTest,
2116 RecodingTest,
2117 PunycodeTest,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002118 UnicodeInternalTest,
Martin v. Löwisa1dde132004-03-24 16:48:24 +00002119 NameprepTest,
Walter Dörwald78a0be62006-04-14 18:25:39 +00002120 IDNACodecTest,
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00002121 CodecsModuleTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002122 StreamReaderTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002123 EncodedFileTest,
Walter Dörwaldc9878e12005-07-20 22:15:39 +00002124 Str2StrTest,
Walter Dörwaldee1d2472004-12-29 16:04:38 +00002125 BasicUnicodeTest,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002126 BasicStrTest,
Georg Brandl8f99f812006-10-29 08:39:22 +00002127 CharmapTest,
2128 WithStmtTest,
Serhiy Storchakac8e58122013-01-29 10:20:34 +02002129 UnicodeEscapeTest,
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02002130 RawUnicodeEscapeTest,
Victor Stinner262be5e2010-05-22 02:11:07 +00002131 BomTest,
Walter Dörwald21d3a322003-05-01 17:45:56 +00002132 )
Fred Drake2e2be372001-09-20 21:33:42 +00002133
Fred Drake2e2be372001-09-20 21:33:42 +00002138
2139if __name__ == "__main__":
2140 test_main()