blob: 57420ffb3b800e8522354f0c825cd020709f76f6 [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    Minimal FIFO buffer: data is appended at one end with write() and
    consumed from the other end with read()
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # append the new data behind whatever is still pending
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # a negative size drains the complete buffer
        if size < 0:
            size = len(self._buffer)
        head, self._buffer = self._buffer[:size], self._buffer[size:]
        return head
25
class ReadTest(unittest.TestCase):
    # Shared reader tests. The subclasses in this file set the ``encoding``
    # class attribute that every method here reads through self.encoding.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # read until readline() returns an empty string and return the
            # lines joined with "|" so that the line boundaries are visible
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are listed explicitly: every one of them is a
        # whitespace character, so splitting a string of them with split()
        # yields an empty list and the loops below would never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # start at 200 characters so that no line is empty (an empty
            # line without its line end would stop readalllines() early)
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines(u"".join(vw), True), u"|".join(vw))
        self.assertEqual(readalllines(u"".join(vw), False), u"|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # every "a" line is followed by an "xxx" line
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # iterate a reader over a longer \r\n terminated text and check
        # that every line comes back unchanged
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # writer and reader share one Queue, so data can be made available
        # to the reader piece by piece
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # u"spamspam" with exactly one leading BOM, in little endian and
    # big endian byte order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # neither one nor two code units of 0xff bytes form a valid BOM
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, depending on the
        # error handler
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
317
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # one expected entry per encoded byte: a character only shows up
        # once the last of its four bytes has been fed
        text = u"\x00\xff\u0100\uffff"
        expected = (
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        decode = codecs.utf_32_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
350
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # one expected entry per encoded byte: a character only shows up
        # once the last of its four bytes has been fed
        text = u"\x00\xff\u0100\uffff"
        expected = (
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        decode = codecs.utf_32_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
383
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" with exactly one leading BOM, in little endian and
    # big endian byte order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # neither one nor two code units of 0xff bytes form a valid BOM
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated code unit is replaced or dropped, depending on the
        # error handler
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
439
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # one expected entry per encoded byte: a character only shows up
        # once the second of its two bytes has been fed
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        decode = codecs.utf_16_le_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
460
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # one expected entry per encoded byte: a character only shows up
        # once the second of its two bytes has been fed
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        decode = codecs.utf_16_be_decode
        self.assertRaises(UnicodeDecodeError, decode, "\xff", "strict", True)
481
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # one expected entry per encoded byte: a character only shows up
        # once the last byte of its sequence has been fed
        text = u"\x00\xff\u07ff\u0800\uffff"
        expected = (
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            3 * [u"\x00\xff\u07ff"] +
            3 * [u"\x00\xff\u07ff\u0800"] +
            [u"\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)
502
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # expected decoder output after each single byte of the encoded
        # form of u"a+-b" has been fed
        text = u"a+-b"
        expected = [u"a", u"a", u"a+", u"a+-", u"a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000517
518class UTF16ExTest(unittest.TestCase):
519
520 def test_errors(self):
521 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
522
523 def test_bad_args(self):
524 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
525
526class ReadBufferTest(unittest.TestCase):
527
528 def test_array(self):
529 import array
530 self.assertEqual(
531 codecs.readbuffer_encode(array.array("c", "spam")),
532 ("spam", 4)
533 )
534
535 def test_empty(self):
536 self.assertEqual(codecs.readbuffer_encode(""), ("", 0))
537
538 def test_bad_args(self):
539 self.assertRaises(TypeError, codecs.readbuffer_encode)
540 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
541
542class CharBufferTest(unittest.TestCase):
543
544 def test_string(self):
545 self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
546
547 def test_empty(self):
548 self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
549
550 def test_bad_args(self):
551 self.assertRaises(TypeError, codecs.charbuffer_encode)
552 self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
553
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # the BOM written by the encoder must not show up in the output
        # of the incremental decoder
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_read(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader, once without
        # a size hint and once for each of a range of read() size hints,
        # and check that the concatenated chunks equal unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # input that starts with a BOM
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)

    def test_stream_bare(self):
        # the same input without a BOM
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_read(bytestring, unistring)
633
Walter Dörwald8709a422002-09-03 13:53:40 +0000634class EscapeDecodeTest(unittest.TestCase):
Walter Dörwalde22d3392005-11-17 08:52:34 +0000635 def test_empty(self):
Walter Dörwald8709a422002-09-03 13:53:40 +0000636 self.assertEquals(codecs.escape_decode(""), ("", 0))
637
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c
        raw = StringIO.StringIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000646
# From RFC 3492
# Each entry is a pair (unicode string, expected punycode encoding).
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),

    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),

    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),

    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),

    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),

    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check of the table above: show every entry that is not a pair
# (e.g. because of a missing comma between two string literals).
for i in punycode_testcases:
    if len(i) != 2:
        print(repr(i))
754
class PunycodeTest(unittest.TestCase):
    # Runs every (unicode, punycode) pair of punycode_testcases through
    # the "punycode" codec in both directions.

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
768
Walter Dörwalda47d1c02005-08-30 10:23:14 +0000769class UnicodeInternalTest(unittest.TestCase):
770 def test_bug1251300(self):
771 # Decoding with unicode_internal used to not correctly handle "code
772 # points" above 0x10ffff on UCS-4 builds.
773 if sys.maxunicode > 0xffff:
774 ok = [
775 ("\x00\x10\xff\xff", u"\U0010ffff"),
776 ("\x00\x00\x01\x01", u"\U00000101"),
777 ("", u""),
778 ]
779 not_ok = [
780 "\x7f\xff\xff\xff",
781 "\x80\x00\x00\x00",
782 "\x81\x00\x00\x00",
783 "\x00",
784 "\x00\x00\x00\x00\x00",
785 ]
786 for internal, uni in ok:
787 if sys.byteorder == "little":
788 internal = "".join(reversed(internal))
789 self.assertEquals(uni, internal.decode("unicode_internal"))
790 for internal in not_ok:
791 if sys.byteorder == "little":
792 internal = "".join(reversed(internal))
793 self.assertRaises(UnicodeDecodeError, internal.decode,
794 "unicode_internal")
795
796 def test_decode_error_attributes(self):
797 if sys.maxunicode > 0xffff:
798 try:
799 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
800 except UnicodeDecodeError, ex:
801 self.assertEquals("unicode_internal", ex.encoding)
802 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
803 self.assertEquals(4, ex.start)
804 self.assertEquals(8, ex.end)
805 else:
806 self.fail()
807
808 def test_decode_callback(self):
809 if sys.maxunicode > 0xffff:
810 codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
811 decoder = codecs.getdecoder("unicode_internal")
812 ab = u"ab".encode("unicode_internal")
813 ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
814 "UnicodeInternalTest")
815 self.assertEquals((u"ab", 12), ignored)
816
Martin v. Löwis2548c732003-04-18 10:39:54 +0000817# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
818nameprep_tests = [
819 # 3.1 Map to nothing.
820 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
821 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
822 '\xb8\x8f\xef\xbb\xbf',
823 'foobarbaz'),
824 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
825 ('CAFE',
826 'cafe'),
827 # 3.3 Case folding 8bit U+00DF (german sharp s).
828 # The original test case is bogus; it says \xc3\xdf
829 ('\xc3\x9f',
830 'ss'),
831 # 3.4 Case folding U+0130 (turkish capital I with dot).
832 ('\xc4\xb0',
833 'i\xcc\x87'),
834 # 3.5 Case folding multibyte U+0143 U+037A.
835 ('\xc5\x83\xcd\xba',
836 '\xc5\x84 \xce\xb9'),
837 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
838 # XXX: skip this as it fails in UCS-2 mode
839 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
840 # 'telc\xe2\x88\x95kg\xcf\x83'),
841 (None, None),
842 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
843 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
844 '\xc7\xb0 a'),
845 # 3.8 Case folding U+1FB7 and normalization.
846 ('\xe1\xbe\xb7',
847 '\xe1\xbe\xb6\xce\xb9'),
848 # 3.9 Self-reverting case folding U+01F0 and normalization.
849 # The original test case is bogus, it says `\xc7\xf0'
850 ('\xc7\xb0',
851 '\xc7\xb0'),
852 # 3.10 Self-reverting case folding U+0390 and normalization.
853 ('\xce\x90',
854 '\xce\x90'),
855 # 3.11 Self-reverting case folding U+03B0 and normalization.
856 ('\xce\xb0',
857 '\xce\xb0'),
858 # 3.12 Self-reverting case folding U+1E96 and normalization.
859 ('\xe1\xba\x96',
860 '\xe1\xba\x96'),
861 # 3.13 Self-reverting case folding U+1F56 and normalization.
862 ('\xe1\xbd\x96',
863 '\xe1\xbd\x96'),
864 # 3.14 ASCII space character U+0020.
865 (' ',
866 ' '),
867 # 3.15 Non-ASCII 8bit space character U+00A0.
868 ('\xc2\xa0',
869 ' '),
870 # 3.16 Non-ASCII multibyte space character U+1680.
871 ('\xe1\x9a\x80',
872 None),
873 # 3.17 Non-ASCII multibyte space character U+2000.
874 ('\xe2\x80\x80',
875 ' '),
876 # 3.18 Zero Width Space U+200b.
877 ('\xe2\x80\x8b',
878 ''),
879 # 3.19 Non-ASCII multibyte space character U+3000.
880 ('\xe3\x80\x80',
881 ' '),
882 # 3.20 ASCII control characters U+0010 U+007F.
883 ('\x10\x7f',
884 '\x10\x7f'),
885 # 3.21 Non-ASCII 8bit control character U+0085.
886 ('\xc2\x85',
887 None),
888 # 3.22 Non-ASCII multibyte control character U+180E.
889 ('\xe1\xa0\x8e',
890 None),
891 # 3.23 Zero Width No-Break Space U+FEFF.
892 ('\xef\xbb\xbf',
893 ''),
894 # 3.24 Non-ASCII control character U+1D175.
895 ('\xf0\x9d\x85\xb5',
896 None),
897 # 3.25 Plane 0 private use character U+F123.
898 ('\xef\x84\xa3',
899 None),
900 # 3.26 Plane 15 private use character U+F1234.
901 ('\xf3\xb1\x88\xb4',
902 None),
903 # 3.27 Plane 16 private use character U+10F234.
904 ('\xf4\x8f\x88\xb4',
905 None),
906 # 3.28 Non-character code point U+8FFFE.
907 ('\xf2\x8f\xbf\xbe',
908 None),
909 # 3.29 Non-character code point U+10FFFF.
910 ('\xf4\x8f\xbf\xbf',
911 None),
912 # 3.30 Surrogate code U+DF42.
913 ('\xed\xbd\x82',
914 None),
915 # 3.31 Non-plain text character U+FFFD.
916 ('\xef\xbf\xbd',
917 None),
918 # 3.32 Ideographic description character U+2FF5.
919 ('\xe2\xbf\xb5',
920 None),
921 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +0000922 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +0000923 '\xcc\x81'),
924 # 3.34 Left-to-right mark U+200E.
925 ('\xe2\x80\x8e',
926 None),
927 # 3.35 Deprecated U+202A.
928 ('\xe2\x80\xaa',
929 None),
930 # 3.36 Language tagging character U+E0001.
931 ('\xf3\xa0\x80\x81',
932 None),
933 # 3.37 Language tagging character U+E0042.
934 ('\xf3\xa0\x81\x82',
935 None),
936 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
937 ('foo\xd6\xbebar',
938 None),
939 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
940 ('foo\xef\xb5\x90bar',
941 None),
942 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
943 ('foo\xef\xb9\xb6bar',
944 'foo \xd9\x8ebar'),
945 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
946 ('\xd8\xa71',
947 None),
948 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
949 ('\xd8\xa71\xd8\xa8',
950 '\xd8\xa71\xd8\xa8'),
951 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +0000952 # Skip this test as we allow unassigned
953 #('\xf3\xa0\x80\x82',
954 # None),
955 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000956 # 3.44 Larger test (shrinking).
957 # Original test case reads \xc3\xdf
958 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
959 '\xaa\xce\xb0\xe2\x80\x80',
960 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
961 # 3.45 Larger test (expanding).
962 # Original test case reads \xc3\x9f
963 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
964 '\x80',
965 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
966 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
967 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
968 ]
969
970
class NameprepTest(unittest.TestCase):
    """Run the module-level ``nameprep_tests`` vectors through nameprep()."""

    def test_nameprep(self):
        """Check every (orig, prepped) pair in ``nameprep_tests``.

        Each vector is a pair of UTF-8 encoded byte strings.  An ``orig`` of
        None marks a skipped vector; a ``prepped`` of None means the input
        must be rejected with UnicodeError.  Any failure is re-raised as
        test_support.TestFailed carrying the vector number.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is not None:
                prepped = unicode(prepped, "utf-8")
            try:
                if prepped is None:
                    # Input contains prohibited characters
                    self.assertRaises(UnicodeError, nameprep, orig)
                else:
                    self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                # Label the failure with the vector number (list index + 1)
                # for both the "must raise" and the "must equal" branch.
                raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
989
Walter Dörwald78a0be62006-04-14 18:25:39 +0000990class IDNACodecTest(unittest.TestCase):
991 def test_builtin_decode(self):
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000992 self.assertEquals(unicode("python.org", "idna"), u"python.org")
Walter Dörwald78a0be62006-04-14 18:25:39 +0000993 self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
994 self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
995 self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
996
997 def test_builtin_encode(self):
998 self.assertEquals(u"python.org".encode("idna"), "python.org")
999 self.assertEquals("python.org.".encode("idna"), "python.org.")
1000 self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1001 self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001002
Martin v. Löwis8b595142005-08-25 11:03:38 +00001003 def test_stream(self):
1004 import StringIO
1005 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1006 r.read(3)
1007 self.assertEquals(r.read(), u"")
1008
Walter Dörwald78a0be62006-04-14 18:25:39 +00001009 def test_incremental_decode(self):
1010 self.assertEquals(
1011 "".join(codecs.iterdecode("python.org", "idna")),
1012 u"python.org"
1013 )
1014 self.assertEquals(
1015 "".join(codecs.iterdecode("python.org.", "idna")),
1016 u"python.org."
1017 )
1018 self.assertEquals(
1019 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1020 u"pyth\xf6n.org."
1021 )
1022 self.assertEquals(
1023 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1024 u"pyth\xf6n.org."
1025 )
1026
1027 decoder = codecs.getincrementaldecoder("idna")()
1028 self.assertEquals(decoder.decode("xn--xam", ), u"")
1029 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1030 self.assertEquals(decoder.decode(u"rg"), u"")
1031 self.assertEquals(decoder.decode(u"", True), u"org")
1032
1033 decoder.reset()
1034 self.assertEquals(decoder.decode("xn--xam", ), u"")
1035 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1036 self.assertEquals(decoder.decode("rg."), u"org.")
1037 self.assertEquals(decoder.decode("", True), u"")
1038
1039 def test_incremental_encode(self):
1040 self.assertEquals(
1041 "".join(codecs.iterencode(u"python.org", "idna")),
1042 "python.org"
1043 )
1044 self.assertEquals(
1045 "".join(codecs.iterencode(u"python.org.", "idna")),
1046 "python.org."
1047 )
1048 self.assertEquals(
1049 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1050 "xn--pythn-mua.org."
1051 )
1052 self.assertEquals(
1053 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1054 "xn--pythn-mua.org."
1055 )
1056
1057 encoder = codecs.getincrementalencoder("idna")()
1058 self.assertEquals(encoder.encode(u"\xe4x"), "")
1059 self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1060 self.assertEquals(encoder.encode(u"", True), "org")
1061
1062 encoder.reset()
1063 self.assertEquals(encoder.encode(u"\xe4x"), "")
1064 self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1065 self.assertEquals(encoder.encode(u"", True), "")
1066
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001067class CodecsModuleTest(unittest.TestCase):
1068
1069 def test_decode(self):
1070 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1071 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001072 self.assertRaises(TypeError, codecs.decode)
1073 self.assertEquals(codecs.decode('abc'), u'abc')
1074 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1075
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001076 def test_encode(self):
1077 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1078 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001079 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001080 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +00001081 self.assertEquals(codecs.encode(u'abc'), 'abc')
1082 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1083
1084 def test_register(self):
1085 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001086 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001087
1088 def test_lookup(self):
1089 self.assertRaises(TypeError, codecs.lookup)
1090 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001091 self.assertRaises(LookupError, codecs.lookup, " ")
1092
1093 def test_getencoder(self):
1094 self.assertRaises(TypeError, codecs.getencoder)
1095 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1096
1097 def test_getdecoder(self):
1098 self.assertRaises(TypeError, codecs.getdecoder)
1099 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1100
1101 def test_getreader(self):
1102 self.assertRaises(TypeError, codecs.getreader)
1103 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1104
1105 def test_getwriter(self):
1106 self.assertRaises(TypeError, codecs.getwriter)
1107 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001108
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001109class StreamReaderTest(unittest.TestCase):
1110
1111 def setUp(self):
1112 self.reader = codecs.getreader('utf-8')
1113 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1114
1115 def test_readlines(self):
1116 f = self.reader(self.stream)
1117 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
1118
Georg Brandl8f99f812006-10-29 08:39:22 +00001119class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001120
Georg Brandl8f99f812006-10-29 08:39:22 +00001121 def test_basic(self):
1122 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001123 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
1124 self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001125
1126 f = StringIO.StringIO()
1127 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1128 ef.write('\xc3\xbc')
1129 self.assertEquals(f.getvalue(), '\xfc')
1130
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001131class Str2StrTest(unittest.TestCase):
1132
1133 def test_read(self):
1134 sin = "\x80".encode("base64_codec")
1135 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1136 sout = reader.read()
1137 self.assertEqual(sout, "\x80")
1138 self.assert_(isinstance(sout, str))
1139
1140 def test_readline(self):
1141 sin = "\x80".encode("base64_codec")
1142 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1143 sout = reader.readline()
1144 self.assertEqual(sout, "\x80")
1145 self.assert_(isinstance(sout, str))
1146
# Encodings exercised with unicode input by the generic round-trip tests.
# Entries are grouped by family but kept in their original order.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode", "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213", "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs exposes mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal",
]
# Snapshot taken *before* the optional compression codecs are appended
# below, so those codecs are not part of this list.
broken_incremental_coders = list(broken_unicode_with_streams)

# The following encodings only support "strict" mode
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are registered for testing only when their
# backing module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1295
class BasicUnicodeTest(unittest.TestCase):
    """Generic round-trip and argument checks run over all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII text through every codec API.

        For each encoding the one-shot encoder/decoder is checked, then
        (unless the encoding is listed as broken for that API) the stream
        reader/writer, the incremental coders obtained from Python and
        from _testcapi, and iterencode()/iterdecode().
        """
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # The registered codec name must match the name it is looked
            # up by, modulo "-"/"_" and the special cases handled here.
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            # One-shot encode/decode.
            (encoded, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, feeding one item at a time
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading the first line."""
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        """Decoders reject a missing argument and (mostly) a non-string one."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna and punycode are exempted from the integer-argument check.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Encoders reject being called without an argument."""
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        """Taking and comparing the type of an encoding table must not crash."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1419
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the str-only codecs in all_string_encodings."""

    def test_basics(self):
        """Encoding then decoding a byte string returns the original."""
        s = "abc123"
        for encoding in all_string_encodings:
            # "encoded" rather than "bytes" so the builtin is not shadowed.
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1428
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001429class CharmapTest(unittest.TestCase):
1430 def test_decode_with_string_map(self):
1431 self.assertEquals(
1432 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1433 (u"abc", 3)
1434 )
1435
1436 self.assertEquals(
1437 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1438 (u"ab\ufffd", 3)
1439 )
1440
1441 self.assertEquals(
1442 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1443 (u"ab\ufffd", 3)
1444 )
1445
1446 self.assertEquals(
1447 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1448 (u"ab", 3)
1449 )
1450
1451 self.assertEquals(
1452 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1453 (u"ab", 3)
1454 )
1455
1456 allbytes = "".join(chr(i) for i in xrange(256))
1457 self.assertEquals(
1458 codecs.charmap_decode(allbytes, "ignore", u""),
1459 (u"", len(allbytes))
1460 )
1461
Georg Brandl8f99f812006-10-29 08:39:22 +00001462class WithStmtTest(unittest.TestCase):
1463 def test_encodedfile(self):
1464 f = StringIO.StringIO("\xc3\xbc")
1465 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1466 self.assertEquals(ef.read(), "\xfc")
1467
1468 def test_streamreaderwriter(self):
1469 f = StringIO.StringIO("\xc3\xbc")
1470 info = codecs.lookup("utf-8")
1471 with codecs.StreamReaderWriter(f, info.streamreader,
1472 info.streamwriter, 'strict') as srw:
1473 self.assertEquals(srw.read(), u"\xfc")
1474
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001475
def test_main():
    """Run every test case class of this module through test_support."""
    # Listed in the order in which the classes are handed to run_unittest().
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()