blob: 18933112a6ecfa0fb5274f5205d0d66a197c92cc [file] [log] [blame]
Barry Warsaw04f357c2002-07-23 19:04:11 +00001from test import test_support
2import unittest
Marc-André Lemburga37171d2001-06-19 20:09:28 +00003import codecs
Walter Dörwald9ae019b2006-03-18 14:22:26 +00004import sys, StringIO, _testcapi
Marc-André Lemburga37171d2001-06-19 20:09:28 +00005
class Queue(object):
    """
    In-memory FIFO of characters: write() appends at the tail,
    read() consumes from the head.
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        # Append the new data behind whatever is still pending.
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; otherwise at most
        # ``size`` characters are removed from the head and returned.
        pending = self._buffer
        if size < 0:
            self._buffer = ""
            return pending
        self._buffer = pending[size:]
        return pending[:size]
25
class ReadTest(unittest.TestCase):
    # Shared reader tests.  Every method uses ``self.encoding``, which the
    # subclasses in this file define as the name of the codec under test.

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        encoded = input.encode(self.encoding)
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Collect lines until readline() returns an empty result and
            # join them with "|" so that the line boundaries stay visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # The line ends are spelled out as a tuple: they are all whitespace,
        # so u"\n \r\n \r \u2028".split() would return an empty list and the
        # loops below would silently never run.
        lineends = (u"\n", u"\r\n", u"\r", u"\u2028")

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            # "+200" keeps the first line non-empty; readalllines() stops
            # at the first empty result.
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    # consume the interleaved "xxx" line as well
                    self.assertEqual(
                        reader.readline(keepends=True),
                        u"xxx\n",
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        u"xxx",
                    )

    def test_bug1175396(self):
        # Iterating over a StreamReader must yield exactly the lines that
        # were written, in order.
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Writer and reader share one Queue, so data becomes available to
        # readline() piece by piece.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    encoding = "utf-32"

    # u"spamspam" with a single leading BOM, in either byte order
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # input that does not start with a valid BOM must be rejected
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated final code unit is handed to the error handler
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)
317
class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        # Four bytes per character: a character only shows up in the
        # output once its fourth byte has been fed to the decoder.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x03\x02\x01\x00")

    def test_errors(self):
        # a lone byte with final=True must fail under "strict"
        args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, *args)
350
class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        # Four bytes per character: a character only shows up in the
        # output once its fourth byte has been fed to the decoder.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            3 * [u""] +
            4 * [u"\x00"] +
            4 * [u"\x00\xff"] +
            4 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_simple(self):
        encoded = u"\U00010203".encode(self.encoding)
        self.assertEqual(encoded, "\x00\x01\x02\x03")

    def test_errors(self):
        # a lone byte with final=True must fail under "strict"
        args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, *args)
383
class UTF16Test(ReadTest):
    encoding = "utf-16"

    # u"spamspam" with a single leading BOM, in either byte order
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # input that does not start with a valid BOM must be rejected
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        # a truncated final code unit is handed to the error handler
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
439
class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        # Two bytes per character: a character only shows up in the
        # output once its second byte has been fed to the decoder.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # a lone byte with final=True must fail under "strict"
        args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, *args)
460
class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        # Two bytes per character: a character only shows up in the
        # output once its second byte has been fed to the decoder.
        text = u"\x00\xff\u0100\uffff"
        expected = (
            [u""] +
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            2 * [u"\x00\xff\u0100"] +
            [u"\x00\xff\u0100\uffff"]
        )
        self.check_partial(text, expected)

    def test_errors(self):
        # a lone byte with final=True must fail under "strict"
        args = ("\xff", "strict", True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, *args)
481
class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        # The input mixes one-, two- and three-byte sequences; each entry
        # is the text decoded so far after one more byte has been fed in.
        text = u"\x00\xff\u07ff\u0800\uffff"
        expected = (
            2 * [u"\x00"] +
            2 * [u"\x00\xff"] +
            3 * [u"\x00\xff\u07ff"] +
            3 * [u"\x00\xff\u07ff\u0800"] +
            [u"\x00\xff\u07ff\u0800\uffff"]
        )
        self.check_partial(text, expected)
502
class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        # one expected entry per encoded byte of the input
        text = u"a+-b"
        expected = 2 * [u"a"] + [u"a+", u"a+-", u"a+-b"]
        self.check_partial(text, expected)
Walter Dörwalde22d3392005-11-17 08:52:34 +0000517
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        # a lone byte with final=True must fail under "strict"
        args = ("\xff", "strict", 0, True)
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, *args)

    def test_bad_args(self):
        # calling the decoder without any argument is a TypeError
        decode = codecs.utf_16_ex_decode
        self.assertRaises(TypeError, decode)
525
class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        chars = array.array("c", "spam")
        self.assertEqual(codecs.readbuffer_encode(chars), ("spam", 4))

    def test_empty(self):
        result = codecs.readbuffer_encode("")
        self.assertEqual(result, ("", 0))

    def test_bad_args(self):
        # both a missing argument and a non-buffer argument are rejected
        encode = codecs.readbuffer_encode
        self.assertRaises(TypeError, encode)
        self.assertRaises(TypeError, encode, 42)
540 self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
541
class CharBufferTest(unittest.TestCase):

    def test_string(self):
        result = codecs.charbuffer_encode("spam")
        self.assertEqual(result, ("spam", 4))

    def test_empty(self):
        result = codecs.charbuffer_encode("")
        self.assertEqual(result, ("", 0))

    def test_bad_args(self):
        # both a missing argument and a non-buffer argument are rejected
        encode = codecs.charbuffer_encode
        self.assertRaises(TypeError, encode)
        self.assertRaises(TypeError, encode, 42)
553
class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff"
        expected = [
            u"",
            u"",
            u"", # First BOM has been read and skipped
            u"",
            u"",
            u"\ufeff", # Second BOM has been read and emitted
            u"\ufeff\x00", # "\x00" read and emitted
            u"\ufeff\x00", # First byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
        ]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        self.assertEqual(decoder.decode(text.encode("utf-8-sig")), text)

    def _check_stream(self, bytestring, unistring):
        # Decode ``bytestring`` through a utf-8-sig StreamReader using a
        # range of read sizes (None means an unsized read()) and check
        # that the concatenated chunks always equal ``unistring``.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while True:
                if sizehint is None:
                    data = istream.read()
                else:
                    data = istream.read(sizehint)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # input that starts with a BOM
        self._check_stream(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )

    def test_stream_bare(self):
        # the same payload without a BOM
        self._check_stream(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ",
        )
633
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        # decoding the empty string yields an empty result and consumes
        # zero bytes
        self.assertEqual(codecs.escape_decode(""), ("", 0))
637
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        # Regression test without assertions: Python used to crash on
        # this at exit because of a refcount bug in _codecsmodule.c
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
Fred Drake2e2be372001-09-20 21:33:42 +0000646
# From RFC 3492
# Each entry is a (unicode text, expected punycode string) pair.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Import-time sanity check: print any entry that is not a 2-tuple (for
# example because a comma went missing between adjacent string literals).
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
754
class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        # decoding is compared exactly, without any case folding
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
768
class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            # The byte strings above are written big-endian; reverse them
            # on little-endian machines.
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError as ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                # decoding the invalid input must not succeed
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            # register the "ignore" handler under a private name and check
            # that the invalid four bytes in the middle are skipped
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # the second item of the result is compared with the number of
        # input characters
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
822
Martin v. Löwis2548c732003-04-18 10:39:54 +0000823# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
824nameprep_tests = [
825 # 3.1 Map to nothing.
826 ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
827 '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
828 '\xb8\x8f\xef\xbb\xbf',
829 'foobarbaz'),
830 # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
831 ('CAFE',
832 'cafe'),
833 # 3.3 Case folding 8bit U+00DF (german sharp s).
834 # The original test case is bogus; it says \xc3\xdf
835 ('\xc3\x9f',
836 'ss'),
837 # 3.4 Case folding U+0130 (turkish capital I with dot).
838 ('\xc4\xb0',
839 'i\xcc\x87'),
840 # 3.5 Case folding multibyte U+0143 U+037A.
841 ('\xc5\x83\xcd\xba',
842 '\xc5\x84 \xce\xb9'),
843 # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
844 # XXX: skip this as it fails in UCS-2 mode
845 #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
846 # 'telc\xe2\x88\x95kg\xcf\x83'),
847 (None, None),
848 # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
849 ('j\xcc\x8c\xc2\xa0\xc2\xaa',
850 '\xc7\xb0 a'),
851 # 3.8 Case folding U+1FB7 and normalization.
852 ('\xe1\xbe\xb7',
853 '\xe1\xbe\xb6\xce\xb9'),
854 # 3.9 Self-reverting case folding U+01F0 and normalization.
855 # The original test case is bogus, it says `\xc7\xf0'
856 ('\xc7\xb0',
857 '\xc7\xb0'),
858 # 3.10 Self-reverting case folding U+0390 and normalization.
859 ('\xce\x90',
860 '\xce\x90'),
861 # 3.11 Self-reverting case folding U+03B0 and normalization.
862 ('\xce\xb0',
863 '\xce\xb0'),
864 # 3.12 Self-reverting case folding U+1E96 and normalization.
865 ('\xe1\xba\x96',
866 '\xe1\xba\x96'),
867 # 3.13 Self-reverting case folding U+1F56 and normalization.
868 ('\xe1\xbd\x96',
869 '\xe1\xbd\x96'),
870 # 3.14 ASCII space character U+0020.
871 (' ',
872 ' '),
873 # 3.15 Non-ASCII 8bit space character U+00A0.
874 ('\xc2\xa0',
875 ' '),
876 # 3.16 Non-ASCII multibyte space character U+1680.
877 ('\xe1\x9a\x80',
878 None),
879 # 3.17 Non-ASCII multibyte space character U+2000.
880 ('\xe2\x80\x80',
881 ' '),
882 # 3.18 Zero Width Space U+200b.
883 ('\xe2\x80\x8b',
884 ''),
885 # 3.19 Non-ASCII multibyte space character U+3000.
886 ('\xe3\x80\x80',
887 ' '),
888 # 3.20 ASCII control characters U+0010 U+007F.
889 ('\x10\x7f',
890 '\x10\x7f'),
891 # 3.21 Non-ASCII 8bit control character U+0085.
892 ('\xc2\x85',
893 None),
894 # 3.22 Non-ASCII multibyte control character U+180E.
895 ('\xe1\xa0\x8e',
896 None),
897 # 3.23 Zero Width No-Break Space U+FEFF.
898 ('\xef\xbb\xbf',
899 ''),
900 # 3.24 Non-ASCII control character U+1D175.
901 ('\xf0\x9d\x85\xb5',
902 None),
903 # 3.25 Plane 0 private use character U+F123.
904 ('\xef\x84\xa3',
905 None),
906 # 3.26 Plane 15 private use character U+F1234.
907 ('\xf3\xb1\x88\xb4',
908 None),
909 # 3.27 Plane 16 private use character U+10F234.
910 ('\xf4\x8f\x88\xb4',
911 None),
912 # 3.28 Non-character code point U+8FFFE.
913 ('\xf2\x8f\xbf\xbe',
914 None),
915 # 3.29 Non-character code point U+10FFFF.
916 ('\xf4\x8f\xbf\xbf',
917 None),
918 # 3.30 Surrogate code U+DF42.
919 ('\xed\xbd\x82',
920 None),
921 # 3.31 Non-plain text character U+FFFD.
922 ('\xef\xbf\xbd',
923 None),
924 # 3.32 Ideographic description character U+2FF5.
925 ('\xe2\xbf\xb5',
926 None),
927 # 3.33 Display property character U+0341.
Tim Peters0eadaac2003-04-24 16:02:54 +0000928 ('\xcd\x81',
Martin v. Löwis2548c732003-04-18 10:39:54 +0000929 '\xcc\x81'),
930 # 3.34 Left-to-right mark U+200E.
931 ('\xe2\x80\x8e',
932 None),
933 # 3.35 Deprecated U+202A.
934 ('\xe2\x80\xaa',
935 None),
936 # 3.36 Language tagging character U+E0001.
937 ('\xf3\xa0\x80\x81',
938 None),
939 # 3.37 Language tagging character U+E0042.
940 ('\xf3\xa0\x81\x82',
941 None),
942 # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
943 ('foo\xd6\xbebar',
944 None),
945 # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
946 ('foo\xef\xb5\x90bar',
947 None),
948 # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
949 ('foo\xef\xb9\xb6bar',
950 'foo \xd9\x8ebar'),
951 # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
952 ('\xd8\xa71',
953 None),
954 # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
955 ('\xd8\xa71\xd8\xa8',
956 '\xd8\xa71\xd8\xa8'),
957 # 3.43 Unassigned code point U+E0002.
Martin v. Löwisb5c4b7b2003-04-18 20:21:00 +0000958 # Skip this test as we allow unassigned
959 #('\xf3\xa0\x80\x82',
960 # None),
961 (None, None),
Martin v. Löwis2548c732003-04-18 10:39:54 +0000962 # 3.44 Larger test (shrinking).
963 # Original test case reads \xc3\xdf
964 ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
965 '\xaa\xce\xb0\xe2\x80\x80',
966 'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
967 # 3.45 Larger test (expanding).
968 # Original test case reads \xc3\x9f
969 ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
970 '\x80',
971 'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
972 '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
973 '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
974 ]
975
976
977class NameprepTest(unittest.TestCase):
978 def test_nameprep(self):
979 from encodings.idna import nameprep
980 for pos, (orig, prepped) in enumerate(nameprep_tests):
981 if orig is None:
982 # Skipped
983 continue
984 # The Unicode strings are given in UTF-8
985 orig = unicode(orig, "utf-8")
986 if prepped is None:
987 # Input contains prohibited characters
988 self.assertRaises(UnicodeError, nameprep, orig)
989 else:
990 prepped = unicode(prepped, "utf-8")
991 try:
992 self.assertEquals(nameprep(orig), prepped)
993 except Exception,e:
994 raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
995
Walter Dörwald78a0be62006-04-14 18:25:39 +0000996class IDNACodecTest(unittest.TestCase):
997 def test_builtin_decode(self):
Martin v. Löwisa1dde132004-03-24 16:48:24 +0000998 self.assertEquals(unicode("python.org", "idna"), u"python.org")
Walter Dörwald78a0be62006-04-14 18:25:39 +0000999 self.assertEquals(unicode("python.org.", "idna"), u"python.org.")
1000 self.assertEquals(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1001 self.assertEquals(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
1002
1003 def test_builtin_encode(self):
1004 self.assertEquals(u"python.org".encode("idna"), "python.org")
1005 self.assertEquals("python.org.".encode("idna"), "python.org.")
1006 self.assertEquals(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1007 self.assertEquals(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
Martin v. Löwisa1dde132004-03-24 16:48:24 +00001008
Martin v. Löwis8b595142005-08-25 11:03:38 +00001009 def test_stream(self):
1010 import StringIO
1011 r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1012 r.read(3)
1013 self.assertEquals(r.read(), u"")
1014
Walter Dörwald78a0be62006-04-14 18:25:39 +00001015 def test_incremental_decode(self):
1016 self.assertEquals(
1017 "".join(codecs.iterdecode("python.org", "idna")),
1018 u"python.org"
1019 )
1020 self.assertEquals(
1021 "".join(codecs.iterdecode("python.org.", "idna")),
1022 u"python.org."
1023 )
1024 self.assertEquals(
1025 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1026 u"pyth\xf6n.org."
1027 )
1028 self.assertEquals(
1029 "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1030 u"pyth\xf6n.org."
1031 )
1032
1033 decoder = codecs.getincrementaldecoder("idna")()
1034 self.assertEquals(decoder.decode("xn--xam", ), u"")
1035 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1036 self.assertEquals(decoder.decode(u"rg"), u"")
1037 self.assertEquals(decoder.decode(u"", True), u"org")
1038
1039 decoder.reset()
1040 self.assertEquals(decoder.decode("xn--xam", ), u"")
1041 self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1042 self.assertEquals(decoder.decode("rg."), u"org.")
1043 self.assertEquals(decoder.decode("", True), u"")
1044
1045 def test_incremental_encode(self):
1046 self.assertEquals(
1047 "".join(codecs.iterencode(u"python.org", "idna")),
1048 "python.org"
1049 )
1050 self.assertEquals(
1051 "".join(codecs.iterencode(u"python.org.", "idna")),
1052 "python.org."
1053 )
1054 self.assertEquals(
1055 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1056 "xn--pythn-mua.org."
1057 )
1058 self.assertEquals(
1059 "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1060 "xn--pythn-mua.org."
1061 )
1062
1063 encoder = codecs.getincrementalencoder("idna")()
1064 self.assertEquals(encoder.encode(u"\xe4x"), "")
1065 self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1066 self.assertEquals(encoder.encode(u"", True), "org")
1067
1068 encoder.reset()
1069 self.assertEquals(encoder.encode(u"\xe4x"), "")
1070 self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1071 self.assertEquals(encoder.encode(u"", True), "")
1072
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001073class CodecsModuleTest(unittest.TestCase):
1074
1075 def test_decode(self):
1076 self.assertEquals(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1077 u'\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001078 self.assertRaises(TypeError, codecs.decode)
1079 self.assertEquals(codecs.decode('abc'), u'abc')
1080 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1081
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001082 def test_encode(self):
1083 self.assertEquals(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1084 '\xe4\xf6\xfc')
Walter Dörwald063e1e82004-10-28 13:04:26 +00001085 self.assertRaises(TypeError, codecs.encode)
Walter Dörwald690402f2005-11-17 18:51:34 +00001086 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
Walter Dörwald063e1e82004-10-28 13:04:26 +00001087 self.assertEquals(codecs.encode(u'abc'), 'abc')
1088 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1089
1090 def test_register(self):
1091 self.assertRaises(TypeError, codecs.register)
Walter Dörwald690402f2005-11-17 18:51:34 +00001092 self.assertRaises(TypeError, codecs.register, 42)
Walter Dörwald063e1e82004-10-28 13:04:26 +00001093
1094 def test_lookup(self):
1095 self.assertRaises(TypeError, codecs.lookup)
1096 self.assertRaises(LookupError, codecs.lookup, "__spam__")
Walter Dörwald690402f2005-11-17 18:51:34 +00001097 self.assertRaises(LookupError, codecs.lookup, " ")
1098
1099 def test_getencoder(self):
1100 self.assertRaises(TypeError, codecs.getencoder)
1101 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1102
1103 def test_getdecoder(self):
1104 self.assertRaises(TypeError, codecs.getdecoder)
1105 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1106
1107 def test_getreader(self):
1108 self.assertRaises(TypeError, codecs.getreader)
1109 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1110
1111 def test_getwriter(self):
1112 self.assertRaises(TypeError, codecs.getwriter)
1113 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
Marc-André Lemburg3f419742004-07-10 12:06:10 +00001114
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +00001115class StreamReaderTest(unittest.TestCase):
1116
1117 def setUp(self):
1118 self.reader = codecs.getreader('utf-8')
1119 self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1120
1121 def test_readlines(self):
1122 f = self.reader(self.stream)
1123 self.assertEquals(f.readlines(), [u'\ud55c\n', u'\uae00'])
1124
Georg Brandl8f99f812006-10-29 08:39:22 +00001125class EncodedFileTest(unittest.TestCase):
Tim Petersabd8a332006-11-03 02:32:46 +00001126
Georg Brandl8f99f812006-10-29 08:39:22 +00001127 def test_basic(self):
1128 f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Georg Brandl5b4e1c22006-10-29 09:32:16 +00001129 ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
1130 self.assertEquals(ef.read(), '\\\xd5\n\x00\x00\xae')
Georg Brandl8f99f812006-10-29 08:39:22 +00001131
1132 f = StringIO.StringIO()
1133 ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1134 ef.write('\xc3\xbc')
1135 self.assertEquals(f.getvalue(), '\xfc')
1136
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001137class Str2StrTest(unittest.TestCase):
1138
1139 def test_read(self):
1140 sin = "\x80".encode("base64_codec")
1141 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1142 sout = reader.read()
1143 self.assertEqual(sout, "\x80")
Benjamin Peterson5c8da862009-06-30 22:57:08 +00001144 self.assertTrue(isinstance(sout, str))
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001145
1146 def test_readline(self):
1147 sin = "\x80".encode("base64_codec")
1148 reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1149 sout = reader.readline()
1150 self.assertEqual(sout, "\x80")
Benjamin Peterson5c8da862009-06-30 22:57:08 +00001151 self.assertTrue(isinstance(sout, str))
Walter Dörwaldc9878e12005-07-20 22:15:39 +00001152
# Codecs exercised by BasicUnicodeTest.  Optional codecs (mbcs, bz2_codec,
# zlib_codec) are appended further down when they are available.
all_unicode_encodings = [
    "ascii", "base64_codec", "big5", "big5hkscs", "charmap",
    "cp037", "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "euc_jis_2004", "euc_jisx0213", "euc_jp", "euc_kr",
    "gb18030", "gb2312", "gbk",
    "hex_codec", "hp_roman8", "hz", "idna",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "iso8859_1", "iso8859_10", "iso8859_11", "iso8859_13", "iso8859_14",
    "iso8859_15", "iso8859_16",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9",
    "johab", "koi8_r", "koi8_u", "latin_1",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
    "mac_roman", "mac_turkish",
    "palmos", "ptcp154", "punycode",
    "raw_unicode_escape", "rot_13",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "tis_620",
    "unicode_escape", "unicode_internal",
    "utf_16", "utf_16_be", "utf_16_le", "utf_7", "utf_8",
]

# "mbcs" is only added when this build of codecs provides mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# Codecs that accept only str input, never unicode.
all_string_encodings = ["quopri_codec", "string_escape", "uu_codec"]

# "undefined" is intentionally absent from every list: it is not supposed
# to work, so it is not tested.

# Codecs that do not work in stateful (stream) mode.
broken_unicode_with_streams = [
    "base64_codec", "hex_codec", "punycode", "unicode_internal",
]
# Snapshot taken *before* bz2_codec/zlib_codec are appended below, so those
# two count as broken for streams but not for the incremental coders.
broken_incremental_coders = list(broken_unicode_with_streams)

# Codecs that support no error mode other than "strict".
only_strict_mode = ["idna", "zlib_codec", "bz2_codec"]

# The compression codecs are tested only when their module can be imported.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1301
class BasicUnicodeTest(unittest.TestCase):
    """Generic round-trip and argument checks run against every codec
    listed in ``all_unicode_encodings``."""

    def test_basics(self):
        # For every codec: check its registered name, a stateless
        # encode/decode round trip, and -- unless the codec is listed as
        # broken for that mode -- round trips through the stream and
        # incremental interfaces.
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            # Build the name to compare with: "*_codec" encodings get the
            # suffix appended to the looked-up name, and "latin_1" is used
            # as is (presumably lookup() reports a different canonical name
            # for these -- TODO confirm).
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            # "_" and "-" are treated as interchangeable in the comparison.
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # Stateless round trip; the encoder must report that it
            # consumed the whole input.
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                # Write one character at a time and collect whatever the
                # writer has pushed into the queue so far ...
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                # ... then feed the result back byte by byte through a
                # reader attached to a fresh queue.
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes whatever the encoder still holds
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        # Same round trips as above, but with coders created
                        # with errors="ignore" and without a final flush.
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # Two long lines, so that the first readline() does not exhaust
        # the input.
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            # Rewinding and re-reading repeatedly must always give a prefix
            # of s.
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Every decoder must reject a call without arguments; all but
        # "idna" and "punycode" are also required to reject an int.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Every encoder must reject a call without arguments.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        # The comparison below is deliberately trivial: it only forces the
        # type object to be used.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1424
Walter Dörwaldee1d2472004-12-29 16:04:38 +00001425class BasicStrTest(unittest.TestCase):
1426 def test_basics(self):
1427 s = "abc123"
1428 for encoding in all_string_encodings:
1429 (bytes, size) = codecs.getencoder(encoding)(s)
1430 self.assertEqual(size, len(s))
1431 (chars, size) = codecs.getdecoder(encoding)(bytes)
1432 self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1433
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001434class CharmapTest(unittest.TestCase):
1435 def test_decode_with_string_map(self):
1436 self.assertEquals(
1437 codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
1438 (u"abc", 3)
1439 )
1440
1441 self.assertEquals(
1442 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
1443 (u"ab\ufffd", 3)
1444 )
1445
1446 self.assertEquals(
1447 codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
1448 (u"ab\ufffd", 3)
1449 )
1450
1451 self.assertEquals(
1452 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
1453 (u"ab", 3)
1454 )
1455
1456 self.assertEquals(
1457 codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
1458 (u"ab", 3)
1459 )
1460
1461 allbytes = "".join(chr(i) for i in xrange(256))
1462 self.assertEquals(
1463 codecs.charmap_decode(allbytes, "ignore", u""),
1464 (u"", len(allbytes))
1465 )
1466
Georg Brandl8f99f812006-10-29 08:39:22 +00001467class WithStmtTest(unittest.TestCase):
1468 def test_encodedfile(self):
1469 f = StringIO.StringIO("\xc3\xbc")
1470 with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1471 self.assertEquals(ef.read(), "\xfc")
1472
1473 def test_streamreaderwriter(self):
1474 f = StringIO.StringIO("\xc3\xbc")
1475 info = codecs.lookup("utf-8")
1476 with codecs.StreamReaderWriter(f, info.streamreader,
1477 info.streamwriter, 'strict') as srw:
1478 self.assertEquals(srw.read(), u"\xfc")
1479
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00001480
def test_main():
    # Hand every TestCase class of this module to test_support.run_unittest(),
    # in the order listed here.
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
    )
    test_support.run_unittest(*test_classes)


# Allow running this file directly as a script.
if __name__ == "__main__":
    test_main()