blob: 7a4c70b964f785bee318088ecf3984192cbd07e4 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Antoine Pitrou4cfae022011-07-24 02:51:01 +02004import locale
Walter Dörwald9ae019b2006-03-18 14:22:26 +00005import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00006
def coding_checker(self, coder):
    """Build an assertion helper for a codec function.

    ``self`` is the object whose ``assertEqual`` is used for the check and
    ``coder`` is a callable applied to the input.  The returned
    ``check(input, expect)`` asserts that ``coder(input)`` equals
    ``(expect, len(input))``, i.e. the expected output together with a
    consumed length covering the complete input.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
11
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    Data passed to write() is appended to an internal string buffer and
    read() removes data from the front of that buffer.
    """
    def __init__(self):
        # Everything written but not yet read.
        self._buffer = ""

    def write(self, chars):
        # Append chars to the end of the pending data.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most
        # ``size`` items are removed from the front and returned.
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s
31
class ReadTest(unittest.TestCase):
    """Reader tests shared by the per-encoding test classes.

    Every method reads ``self.encoding``; the subclasses in this file set
    that attribute to the codec name under test.
    """
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # Exercise StreamReader.readline() with every line-end variant,
        # with and without keepends, and with an explicit size hint.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read until readline() returns an empty result and join the
            # collected lines with "|" so line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                # The input repeats the (a-line, "xxx" line) pair ten times,
                # so each loop iteration below consumes exactly one pair.
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mix readline(), read() and readlines() on the same reader and
        # check that no data is lost or duplicated between the calls.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read() followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #12446: Test read() followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterate a reader over a multi-line "\r\n" document and check that
        # every line comes back unchanged.
        # NOTE(review): runs of spaces inside these literals may have been
        # collapsed when this file was extracted; if the regression depends
        # on exact line lengths, compare against the upstream test data.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so readline() only ever sees
        # the data written so far.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n" terminated lines must come back one per readline()
        # call, followed by an empty result at end of input.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
295
class UTF32Test(ReadTest):
    """ReadTest checks plus BOM and error-handler tests for "utf-32"."""
    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, little and big endian.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must raise on read.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit is replaced or dropped by the handler.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
380
class UTF32LETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-32-le"."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # The code point is written least significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
424
class UTF32BETest(ReadTest):
    """ReadTest checks plus codec-specific tests for "utf-32-be"."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # The code point is written most significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
468
469
class UTF16Test(ReadTest):
    """ReadTest checks plus BOM, handler and file tests for "utf-16"."""
    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, little and big endian.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input that does not start with a valid BOM must raise on read.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated final unit is replaced or dropped by the handler.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup before creating the file so it is removed
        # even when one of the steps below fails.
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
Florent Xiclunaf4b61862010-02-26 10:40:58 +0000542
class UTF16LETest(ReadTest):
    """ReadTest checks plus error-handling tests for "utf-16-le"."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with its expected 'replace'
        # decoding; 'strict' decoding of the same input must raise.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000579
class UTF16BETest(ReadTest):
    """ReadTest checks plus error-handling tests for "utf-16-be"."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with its expected 'replace'
        # decoding; 'strict' decoding of the same input must raise.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000616
class UTF8Test(ReadTest):
    """ReadTest checks plus a byte-by-byte decoding test for "utf-8"."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
641
class UTF7Test(ReadTest):
    """ReadTest checks plus error and non-BMP tests for "utf-7"."""
    encoding = "utf-7"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        # Each entry pairs malformed input with its expected 'replace'
        # decoding; 'strict' decoding of the same input must raise.
        tests = [
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # The non-BMP character and its surrogate-pair spelling encode to
        # the same bytes, and those bytes decode to the single character.
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
685
class UTF16ExTest(unittest.TestCase):
    """Argument and error checks for codecs.utf_16_ex_decode()."""

    def test_errors(self):
        # A lone byte with final=True cannot be decoded in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
693
class ReadBufferTest(unittest.TestCase):
    """Checks for codecs.readbuffer_encode()."""

    def test_array(self):
        # An array of characters is accepted and returned as a string
        # together with the number of items consumed.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument both raise TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
709
class CharBufferTest(unittest.TestCase):
    """Checks for codecs.charbuffer_encode()."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument both raise TypeError.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
721
class UTF8SigTest(ReadTest):
    """ReadTest checks plus BOM handling tests for "utf-8-sig"."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # One expected partial result per encoded byte fed to the decoder.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # The BOM written by the encoder is not part of the decoded text.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Read BOM-prefixed data with a range of size hints; the result
        # must be the text without the BOM regardless of chunking.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom, but the input carries no BOM at all.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
805
class EscapeDecodeTest(unittest.TestCase):
    """Checks for codecs.escape_decode()."""
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        # Any byte other than a backslash passes through unchanged.
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        # Recognized escapes are translated; unknown ones are left alone.
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        # Every character that does not start a recognized escape must be
        # kept together with its backslash.
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        # Incomplete \x escapes raise by default and are dropped or
        # replaced under the "ignore" and "replace" handlers.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
Serhiy Storchaka01b3a082013-01-25 23:30:50 +0200857
class RecodingTest(unittest.TestCase):
    """Regression test for recoding through codecs.EncodedFile."""

    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c, so there is nothing to assert here:
        # the write/close sequence just has to run.
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000866
# From RFC 3492
# Each entry is a pair: (unicode text, expected punycode byte string).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #  <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check on the table above: any entry that is not a
# 2-tuple (for example because of a missing comma between the adjacent
# string literals) is printed so the mistake is visible.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
974
class PunycodeTest(unittest.TestCase):
    """Encodes and decodes every punycode_testcases vector."""

    def test_encode(self):
        for text, reference in punycode_testcases:
            # The comparison is case-insensitive on both sides: some of
            # the reference encodings contain upper case while our codec
            # only produces lower case, and lowering the reference alone
            # would not be enough because some input characters are
            # themselves upper case.
            produced = text.encode("punycode")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for text, encoded in punycode_testcases:
            decoded = encoded.decode("punycode")
            self.assertEqual(text, decoded)
Martin v. Löwis2548c732003-04-18 10:39:54 +0000988
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the unicode_internal codec (plus one string-escape check).

    Three of the tests only apply when unicode code units are wider than
    16 bits.  On narrower builds they are reported as skipped instead of
    silently passing without running a single assertion.
    """

    @staticmethod
    def _host_order(big_endian):
        # The vectors in test_bug1251300 are written most-significant
        # byte first; on little-endian hosts the whole string is reversed
        # to obtain the host layout.
        if sys.byteorder == "little":
            return "".join(reversed(big_endian))
        return big_endian

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires a unicode build wider than 16 bits")
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            ("\x00\x10\xff\xff", u"\U0010ffff"),
            ("\x00\x00\x01\x01", u"\U00000101"),
            ("", u""),
        ]
        not_ok = [
            "\x7f\xff\xff\xff",
            "\x80\x00\x00\x00",
            "\x81\x00\x00\x00",
            "\x00",
            "\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            internal = self._host_order(internal)
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            internal = self._host_order(internal)
            self.assertRaises(UnicodeDecodeError, internal.decode,
                              "unicode_internal")

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires a unicode build wider than 16 bits")
    def test_decode_error_attributes(self):
        data = "\x00\x00\x00\x00\x00\x11\x11\x00"
        # assertRaises fails the test by itself when nothing is raised,
        # so no explicit else/self.fail() branch is needed.
        with self.assertRaises(UnicodeDecodeError) as caught:
            data.decode("unicode_internal")
        ex = caught.exception
        self.assertEqual("unicode_internal", ex.encoding)
        self.assertEqual(data, ex.object)
        # The error must span exactly the second four-byte unit.
        self.assertEqual(4, ex.start)
        self.assertEqual(8, ex.end)

    @unittest.skipUnless(sys.maxunicode > 0xffff,
                         "requires a unicode build wider than 16 bits")
    def test_decode_callback(self):
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        ab = u"ab".encode("unicode_internal")
        # Splice four undecodable bytes between the encodings of u"a"
        # and u"b"; the registered handler should drop them while the
        # consumed count still covers all twelve input bytes.
        ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                          "UnicodeInternalTest")
        self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # The second item returned by an encoder is the number of input
        # items consumed, not the length of the encoded output.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
Philip Jenvey034b0ac2010-04-05 02:51:51 +00001045
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Each entry is (input, expected); both are UTF-8 encoded byte strings.
# expected is None when nameprep is required to reject the input.
# (None, None) marks a vector that is skipped; the placeholder is kept
# so that list positions keep matching the 3.N numbers in the comments.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    # NOTE(review): the title says U+FB38, but the input bytes
    # \xef\xb9\xb6 are the UTF-8 encoding of U+FE76.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1198
1199
class NameprepTest(unittest.TestCase):
    """Runs encodings.idna.nameprep() over the nameprep_tests vectors."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for index, (raw, expected) in enumerate(nameprep_tests):
            if raw is None:
                # Placeholder entry: this vector is skipped.
                continue
            # The vectors are stored UTF-8 encoded.
            text = unicode(raw, "utf-8")
            if expected is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, text)
                continue
            wanted = unicode(expected, "utf-8")
            try:
                self.assertEqual(nameprep(text), wanted)
            except Exception as exc:
                # Re-raise with the (1-based) vector number attached.
                raise test_support.TestFailed(
                    "Test 3.%d: %s" % (index + 1, str(exc)))
1218
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec: one-shot, stream and incremental use.

    Each group of assertions covers an ASCII-only name and a name with a
    non-ASCII label (xn-- form), each with and without a trailing dot.
    """

    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # StringIO is already imported at module level.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        # Everything was consumed by the first read.
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        # Without a trailing dot the last label is only released by the
        # final flush.
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org", "idna")),
            u"pyth\xf6n.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # A label is held back until the dot that terminates it (or the
        # final call) has been seen.
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        # Without a trailing dot the last label is only released by the
        # final flush.
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
Walter Dörwald78a0be62006-04-14 18:25:39 +00001295
class CodecsModuleTest(unittest.TestCase):
    """Tests for the module-level helper functions of codecs."""

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        # With no encoding argument the default encoding is used.
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        # Note: u'\xffff' is U+00FF followed by the two letters "ff".
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        #
        # Save the current setting with setlocale(category): it returns
        # the setting as a string that can be passed straight back to
        # setlocale().  getlocale() instead returns a parsed
        # (language, encoding) pair, which setlocale() may refuse when
        # the cleanup runs.
        oldlocale = locale.setlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')
1350
class StreamReaderTest(unittest.TestCase):
    """Line-oriented reading through a UTF-8 StreamReader."""

    def setUp(self):
        # Two three-byte UTF-8 sequences separated by a newline.
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        self.reader = codecs.getreader('utf-8')

    def test_readlines(self):
        decoded_stream = self.reader(self.stream)
        lines = decoded_stream.readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001360
class EncodedFileTest(unittest.TestCase):
    """Transcoding through codecs.EncodedFile in both directions."""

    def test_basic(self):
        # Reading: the backing file holds UTF-8, the caller gets UTF-16-LE.
        utf8_source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        as_utf16 = codecs.EncodedFile(utf8_source, 'utf-16-le', 'utf-8')
        self.assertEqual(as_utf16.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: the caller supplies UTF-8, the backing file gets Latin-1.
        latin1_sink = StringIO.StringIO()
        from_utf8 = codecs.EncodedFile(latin1_sink, 'utf-8', 'latin1')
        from_utf8.write('\xc3\xbc')
        self.assertEqual(latin1_sink.getvalue(), '\xfc')
Georg Brandl8f99f812006-10-29 08:39:22 +00001372
class Str2StrTest(unittest.TestCase):
    """Stream readers of a str-to-str codec must hand back str objects."""

    def _base64_reader(self):
        # StreamReader over the base64_codec encoding of the byte "\x80".
        encoded = "\x80".encode("base64_codec")
        make_reader = codecs.getreader("base64_codec")
        return make_reader(StringIO.StringIO(encoded))

    def test_read(self):
        result = self._base64_reader().read()
        self.assertEqual(result, "\x80")
        self.assertIsInstance(result, str)

    def test_readline(self):
        result = self._base64_reader().readline()
        self.assertEqual(result, "\x80")
        self.assertIsInstance(result, str)
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001388
# Encodings that the BasicUnicodeTest loops iterate over.  More entries
# are appended below depending on what is available at import time.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" is only included when this build's codecs module provides it.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# This is a copy taken *before* bz2_codec and zlib_codec are appended to
# broken_unicode_with_streams below, so those two codecs are excluded
# from the stream checks only, not from the incremental-coder checks.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2 and zlib are optional modules: their codecs are only added to the
# lists when the corresponding module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1539
1540class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        # For every encoding in all_unicode_encodings, push the same
        # short text through each codec interface in turn (stateless
        # functions, stream writer/reader, incremental coders obtained
        # from Python and from the C API, iterencode/iterdecode) and
        # check that it comes back unchanged.
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The name reported by lookup() must match the list entry,
            # ignoring the "_" vs "-" spelling; the "_codec" suffix and
            # "latin_1" are adjusted by hand before comparing.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip; the encoder must report that it
            # consumed the whole input.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Data is fed one item at a time through a Queue so that
                # partial input is exercised.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still holds
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Same round trips as above, but with the coders
                        # constructed with errors="ignore" and without a
                        # final flushing call.
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
1629
def test_seek(self):
    # Rewinding a StreamReader with seek(0, 0) has to reset the internal
    # codec state and buffers, so the first line can be read again and
    # again with the same result.
    # all codecs should be able to encode this text
    text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
    for encoding in all_unicode_encodings:
        # FIXME: idna is skipped, see SF bug #1163178
        if encoding == "idna" or encoding in broken_unicode_with_streams:
            continue
        make_reader = codecs.getreader(encoding)
        reader = make_reader(StringIO.StringIO(text.encode(encoding)))
        for _ in xrange(5):
            reader.seek(0, 0)
            first_line = reader.readline()
            self.assertEqual(text[:len(first_line)], first_line)
1644
def test_bad_decode_args(self):
    # Every decoder must raise TypeError when called without arguments;
    # all of them except idna and punycode must also raise TypeError
    # when handed an integer.
    integer_check_skipped = ("idna", "punycode")
    for encoding in all_unicode_encodings:
        decode = codecs.getdecoder(encoding)
        self.assertRaises(TypeError, decode)
        if encoding in integer_check_skipped:
            continue
        self.assertRaises(TypeError, decode, 42)
1651
def test_bad_encode_args(self):
    # Calling any encoder with no arguments at all must raise TypeError.
    for name in all_unicode_encodings:
        self.assertRaises(TypeError, codecs.getencoder(name))
1656
Neal Norwitz6d3d3392006-06-13 08:41:06 +00001657 def test_encoding_map_type_initialized(self):
1658 from encodings import cp1140
1659 # This used to crash, we are only verifying there's no crash.
1660 table_type = type(cp1140.encoding_table)
1661 self.assertEqual(table_type, table_type)
1662
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the codecs named in all_string_encodings."""

    def test_basics(self):
        # Encode a short sample and decode it again: the encoder has to
        # report the full input length as consumed and the decoder has
        # to hand back the original string.
        sample = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(sample)
            self.assertEqual(consumed, len(sample))
            decoded, _ = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(
                decoded, sample,
                "%r != %r (encoding=%r)" % (decoded, sample, encoding))
1671
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001672class CharmapTest(unittest.TestCase):
1673 def test_decode_with_string_map(self):
Ezio Melotti2623a372010-11-21 13:34:58 +00001674 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001675 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1676 (u"abc", 3)
1677 )
1678
Serhiy Storchaka95997452013-01-15 14:42:59 +02001679 self.assertRaises(UnicodeDecodeError,
1680 codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
1681 )
1682
1683 self.assertRaises(UnicodeDecodeError,
1684 codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
1685 )
1686
Ezio Melotti2623a372010-11-21 13:34:58 +00001687 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001688 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1689 (u"ab\ufffd", 3)
1690 )
1691
Ezio Melotti2623a372010-11-21 13:34:58 +00001692 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001693 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1694 (u"ab\ufffd", 3)
1695 )
1696
Ezio Melotti2623a372010-11-21 13:34:58 +00001697 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001698 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1699 (u"ab", 3)
1700 )
1701
Ezio Melotti2623a372010-11-21 13:34:58 +00001702 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001703 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1704 (u"ab", 3)
1705 )
1706
1707 allbytes = "".join(chr(i) for i in xrange(256))
Ezio Melotti2623a372010-11-21 13:34:58 +00001708 self.assertEqual(
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001709 codecs.charmap_decode(allbytes, "ignore", u""),
1710 (u"", len(allbytes))
1711 )
1712
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001713 def test_decode_with_int2str_map(self):
1714 self.assertEqual(
1715 codecs.charmap_decode("\x00\x01\x02", "strict",
1716 {0: u'a', 1: u'b', 2: u'c'}),
1717 (u"abc", 3)
1718 )
1719
1720 self.assertEqual(
1721 codecs.charmap_decode("\x00\x01\x02", "strict",
1722 {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
1723 (u"AaBbCc", 3)
1724 )
1725
1726 self.assertEqual(
1727 codecs.charmap_decode("\x00\x01\x02", "strict",
1728 {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
1729 (u"\U0010FFFFbc", 3)
1730 )
1731
1732 self.assertEqual(
1733 codecs.charmap_decode("\x00\x01\x02", "strict",
1734 {0: u'a', 1: u'b', 2: u''}),
1735 (u"ab", 3)
1736 )
1737
1738 self.assertRaises(UnicodeDecodeError,
1739 codecs.charmap_decode, "\x00\x01\x02", "strict",
1740 {0: u'a', 1: u'b'}
1741 )
1742
Serhiy Storchaka95997452013-01-15 14:42:59 +02001743 self.assertRaises(UnicodeDecodeError,
1744 codecs.charmap_decode, "\x00\x01\x02", "strict",
1745 {0: u'a', 1: u'b', 2: None}
1746 )
1747
1748 # Issue #14850
1749 self.assertRaises(UnicodeDecodeError,
1750 codecs.charmap_decode, "\x00\x01\x02", "strict",
1751 {0: u'a', 1: u'b', 2: u'\ufffe'}
1752 )
1753
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001754 self.assertEqual(
1755 codecs.charmap_decode("\x00\x01\x02", "replace",
1756 {0: u'a', 1: u'b'}),
1757 (u"ab\ufffd", 3)
1758 )
1759
1760 self.assertEqual(
1761 codecs.charmap_decode("\x00\x01\x02", "replace",
1762 {0: u'a', 1: u'b', 2: None}),
1763 (u"ab\ufffd", 3)
1764 )
1765
Serhiy Storchaka95997452013-01-15 14:42:59 +02001766 # Issue #14850
1767 self.assertEqual(
1768 codecs.charmap_decode("\x00\x01\x02", "replace",
1769 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1770 (u"ab\ufffd", 3)
1771 )
1772
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001773 self.assertEqual(
1774 codecs.charmap_decode("\x00\x01\x02", "ignore",
1775 {0: u'a', 1: u'b'}),
1776 (u"ab", 3)
1777 )
1778
1779 self.assertEqual(
1780 codecs.charmap_decode("\x00\x01\x02", "ignore",
1781 {0: u'a', 1: u'b', 2: None}),
1782 (u"ab", 3)
1783 )
1784
Serhiy Storchaka95997452013-01-15 14:42:59 +02001785 # Issue #14850
1786 self.assertEqual(
1787 codecs.charmap_decode("\x00\x01\x02", "ignore",
1788 {0: u'a', 1: u'b', 2: u'\ufffe'}),
1789 (u"ab", 3)
1790 )
1791
1792 allbytes = "".join(chr(i) for i in xrange(256))
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001793 self.assertEqual(
1794 codecs.charmap_decode(allbytes, "ignore", {}),
1795 (u"", len(allbytes))
1796 )
1797
1798 def test_decode_with_int2int_map(self):
1799 a = ord(u'a')
1800 b = ord(u'b')
1801 c = ord(u'c')
1802
1803 self.assertEqual(
1804 codecs.charmap_decode("\x00\x01\x02", "strict",
1805 {0: a, 1: b, 2: c}),
1806 (u"abc", 3)
1807 )
1808
1809 # Issue #15379
1810 self.assertEqual(
1811 codecs.charmap_decode("\x00\x01\x02", "strict",
1812 {0: 0x10FFFF, 1: b, 2: c}),
1813 (u"\U0010FFFFbc", 3)
1814 )
1815
1816 self.assertRaises(TypeError,
1817 codecs.charmap_decode, "\x00\x01\x02", "strict",
1818 {0: 0x110000, 1: b, 2: c}
1819 )
1820
1821 self.assertRaises(UnicodeDecodeError,
1822 codecs.charmap_decode, "\x00\x01\x02", "strict",
1823 {0: a, 1: b},
1824 )
1825
Serhiy Storchaka95997452013-01-15 14:42:59 +02001826 self.assertRaises(UnicodeDecodeError,
1827 codecs.charmap_decode, "\x00\x01\x02", "strict",
1828 {0: a, 1: b, 2: 0xFFFE},
1829 )
1830
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001831 self.assertEqual(
1832 codecs.charmap_decode("\x00\x01\x02", "replace",
1833 {0: a, 1: b}),
1834 (u"ab\ufffd", 3)
1835 )
1836
1837 self.assertEqual(
Serhiy Storchaka95997452013-01-15 14:42:59 +02001838 codecs.charmap_decode("\x00\x01\x02", "replace",
1839 {0: a, 1: b, 2: 0xFFFE}),
1840 (u"ab\ufffd", 3)
1841 )
1842
1843 self.assertEqual(
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001844 codecs.charmap_decode("\x00\x01\x02", "ignore",
1845 {0: a, 1: b}),
1846 (u"ab", 3)
1847 )
1848
Serhiy Storchaka95997452013-01-15 14:42:59 +02001849 self.assertEqual(
1850 codecs.charmap_decode("\x00\x01\x02", "ignore",
1851 {0: a, 1: b, 2: 0xFFFE}),
1852 (u"ab", 3)
1853 )
1854
Antoine Pitroue3ae3212012-11-17 21:14:58 +01001855
Georg Brandl8f99f812006-10-29 08:39:22 +00001856class WithStmtTest(unittest.TestCase):
1857 def test_encodedfile(self):
1858 f = StringIO.StringIO("\xc3\xbc")
1859 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
Ezio Melotti2623a372010-11-21 13:34:58 +00001860 self.assertEqual(ef.read(), "\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001861
1862 def test_streamreaderwriter(self):
1863 f = StringIO.StringIO("\xc3\xbc")
1864 info = codecs.lookup("utf-8")
1865 with codecs.StreamReaderWriter(f, info.streamreader,
1866 info.streamwriter, 'strict') as srw:
Ezio Melotti2623a372010-11-21 13:34:58 +00001867 self.assertEqual(srw.read(), u"\xfc")
Georg Brandl8f99f812006-10-29 08:39:22 +00001868
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001869
Serhiy Storchakac8e58122013-01-29 10:20:34 +02001870class UnicodeEscapeTest(unittest.TestCase):
1871 def test_empty(self):
1872 self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
1873 self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
1874
1875 def test_raw_encode(self):
1876 encode = codecs.unicode_escape_encode
1877 for b in range(32, 127):
1878 if b != ord('\\'):
1879 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1880
1881 def test_raw_decode(self):
1882 decode = codecs.unicode_escape_decode
1883 for b in range(256):
1884 if b != ord('\\'):
1885 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1886
1887 def test_escape_encode(self):
1888 encode = codecs.unicode_escape_encode
1889 check = coding_checker(self, encode)
1890 check(u'\t', r'\t')
1891 check(u'\n', r'\n')
1892 check(u'\r', r'\r')
1893 check(u'\\', r'\\')
1894 for b in range(32):
1895 if chr(b) not in '\t\n\r':
1896 check(unichr(b), '\\x%02x' % b)
1897 for b in range(127, 256):
1898 check(unichr(b), '\\x%02x' % b)
1899 check(u'\u20ac', r'\u20ac')
1900 check(u'\U0001d120', r'\U0001d120')
1901
1902 def test_escape_decode(self):
1903 decode = codecs.unicode_escape_decode
1904 check = coding_checker(self, decode)
1905 check("[\\\n]", u"[]")
1906 check(r'[\"]', u'["]')
1907 check(r"[\']", u"[']")
1908 check(r"[\\]", ur"[\]")
1909 check(r"[\a]", u"[\x07]")
1910 check(r"[\b]", u"[\x08]")
1911 check(r"[\t]", u"[\x09]")
1912 check(r"[\n]", u"[\x0a]")
1913 check(r"[\v]", u"[\x0b]")
1914 check(r"[\f]", u"[\x0c]")
1915 check(r"[\r]", u"[\x0d]")
1916 check(r"[\7]", u"[\x07]")
1917 check(r"[\8]", ur"[\8]")
1918 check(r"[\78]", u"[\x078]")
1919 check(r"[\41]", u"[!]")
1920 check(r"[\418]", u"[!8]")
1921 check(r"[\101]", u"[A]")
1922 check(r"[\1010]", u"[A0]")
1923 check(r"[\x41]", u"[A]")
1924 check(r"[\x410]", u"[A0]")
1925 check(r"\u20ac", u"\u20ac")
1926 check(r"\U0001d120", u"\U0001d120")
1927 for b in range(256):
1928 if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
1929 check('\\' + chr(b), u'\\' + unichr(b))
1930
1931 def test_decode_errors(self):
1932 decode = codecs.unicode_escape_decode
1933 for c, d in ('x', 2), ('u', 4), ('U', 4):
1934 for i in range(d):
1935 self.assertRaises(UnicodeDecodeError, decode,
1936 "\\" + c + "0"*i)
1937 self.assertRaises(UnicodeDecodeError, decode,
1938 "[\\" + c + "0"*i + "]")
1939 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1940 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1941 self.assertEqual(decode(data, "replace"),
1942 (u"[\ufffd]\ufffd", len(data)))
1943 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1944 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1945 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1946
1947
Serhiy Storchaka74e449f2013-01-29 11:39:44 +02001948class RawUnicodeEscapeTest(unittest.TestCase):
1949 def test_empty(self):
1950 self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
1951 self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))
1952
1953 def test_raw_encode(self):
1954 encode = codecs.raw_unicode_escape_encode
1955 for b in range(256):
1956 self.assertEqual(encode(unichr(b)), (chr(b), 1))
1957
1958 def test_raw_decode(self):
1959 decode = codecs.raw_unicode_escape_decode
1960 for b in range(256):
1961 self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
1962
1963 def test_escape_encode(self):
1964 encode = codecs.raw_unicode_escape_encode
1965 check = coding_checker(self, encode)
1966 for b in range(256):
1967 if chr(b) not in 'uU':
1968 check(u'\\' + unichr(b), '\\' + chr(b))
1969 check(u'\u20ac', r'\u20ac')
1970 check(u'\U0001d120', r'\U0001d120')
1971
1972 def test_escape_decode(self):
1973 decode = codecs.raw_unicode_escape_decode
1974 check = coding_checker(self, decode)
1975 for b in range(256):
1976 if chr(b) not in 'uU':
1977 check('\\' + chr(b), u'\\' + unichr(b))
1978 check(r"\u20ac", u"\u20ac")
1979 check(r"\U0001d120", u"\U0001d120")
1980
1981 def test_decode_errors(self):
1982 decode = codecs.raw_unicode_escape_decode
1983 for c, d in ('u', 4), ('U', 4):
1984 for i in range(d):
1985 self.assertRaises(UnicodeDecodeError, decode,
1986 "\\" + c + "0"*i)
1987 self.assertRaises(UnicodeDecodeError, decode,
1988 "[\\" + c + "0"*i + "]")
1989 data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
1990 self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
1991 self.assertEqual(decode(data, "replace"),
1992 (u"[\ufffd]\ufffd", len(data)))
1993 self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
1994 self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
1995 self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1996
1997
Victor Stinner262be5e2010-05-22 02:11:07 +00001998class BomTest(unittest.TestCase):
1999 def test_seek0(self):
Victor Stinner7df55da2010-05-22 13:37:56 +00002000 data = u"1234567890"
Victor Stinner262be5e2010-05-22 02:11:07 +00002001 tests = ("utf-16",
2002 "utf-16-le",
2003 "utf-16-be",
2004 "utf-32",
2005 "utf-32-le",
2006 "utf-32-be")
Victor Stinner6c603c42011-05-23 16:19:31 +02002007 self.addCleanup(test_support.unlink, test_support.TESTFN)
Victor Stinner262be5e2010-05-22 02:11:07 +00002008 for encoding in tests:
Victor Stinner7df55da2010-05-22 13:37:56 +00002009 # Check if the BOM is written only once
2010 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
Victor Stinner262be5e2010-05-22 02:11:07 +00002011 f.write(data)
2012 f.write(data)
2013 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002014 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002015 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002016 self.assertEqual(f.read(), data * 2)
Victor Stinner262be5e2010-05-22 02:11:07 +00002017
Victor Stinner7df55da2010-05-22 13:37:56 +00002018 # Check that the BOM is written after a seek(0)
2019 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2020 f.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002021 self.assertNotEqual(f.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002022 f.seek(0)
2023 f.write(data)
2024 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002025 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002026
2027 # (StreamWriter) Check that the BOM is written after a seek(0)
2028 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2029 f.writer.write(data[0])
Ezio Melotti2623a372010-11-21 13:34:58 +00002030 self.assertNotEqual(f.writer.tell(), 0)
Victor Stinner7df55da2010-05-22 13:37:56 +00002031 f.writer.seek(0)
2032 f.writer.write(data)
2033 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002034 self.assertEqual(f.read(), data)
Victor Stinner7df55da2010-05-22 13:37:56 +00002035
2036 # Check that the BOM is not written after a seek() at a position
2037 # different than the start
2038 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2039 f.write(data)
2040 f.seek(f.tell())
2041 f.write(data)
2042 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002043 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002044
2045 # (StreamWriter) Check that the BOM is not written after a seek()
2046 # at a position different than the start
2047 with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
2048 f.writer.write(data)
2049 f.writer.seek(f.writer.tell())
2050 f.writer.write(data)
2051 f.seek(0)
Ezio Melotti2623a372010-11-21 13:34:58 +00002052 self.assertEqual(f.read(), data * 2)
Victor Stinner7df55da2010-05-22 13:37:56 +00002053
Victor Stinner262be5e2010-05-22 02:11:07 +00002054
def test_main():
    # Hand every test case class of this module to the regression test
    # runner, in the order listed.
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)
Fred Drake2e2be372001-09-20 21:33:42 +00002087
2088
# Run the whole suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()