from test import test_support
import unittest
import codecs
import sys, StringIO, _testcapi

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size<0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
            ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

class UTF32Test(ReadTest):
    encoding = "utf-32"

    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assert_(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEquals(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        try:
            with open(test_support.TESTFN, 'wb') as fp:
                fp.write(s)
            with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
                self.assertEqual(reader.read(), s1)
        finally:
            test_support.unlink(test_support.TESTFN)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEquals(codecs.escape_decode(""), ("", 0))

class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

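# Sanity check: every entry above should be a (unicode, punycode) pair;
# anything else is printed so the malformed test case is easy to spot.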
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEquals(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEquals(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEquals(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEquals("unicode_internal", ex.encoding)
                self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEquals(4, ex.start)
                self.assertEquals(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEquals((u"ab", 12), ignored)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEquals(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEquals(unicode("python.org", "idna"), u"python.org")
        self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
        self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEquals(u"python.org".encode("idna"), "python.org")
        self.assertEquals("python.org.".encode("idna"), "python.org.")
        self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEquals(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                          u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEquals(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                          '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEquals(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEquals(f.getvalue(), '\xfc')

class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assert_(isinstance(sout, str))

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
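# The same codecs are also skipped when exercising the incremental codec API.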
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEquals(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEquals(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEquals(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEquals(srw.read(), u"\xfc")


def test_main():
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )


if __name__ == "__main__":
    test_main()